Skip to content

Commit

Permalink
MRG: add capacity for skipmer sketching and search (#531)
Browse files Browse the repository at this point in the history
* working skipmers, i think!

* test singlesketch skipmers

* add skipmer as moltype option for other commands

* black formatting

* add skipmer test sigs; add fastgather test

* upd

* upd test files

* upd

* fix

* fix2

* try not pinning

* MRG: switch to `importlib.metadata` from `pkg_resources` (#546)

* switch from pkg_resources to importlib.metadata

* black

* need parens for proper selection

* add anydna_size

* update to sourmash prospective v0.18.0

* remove tests for bad zip files 😅

* upd to use py-skipmers

* switch to sourmash latest; add skipmer indexing test

* black formatting

* upd tests

* test for intersect_manifest bug

* fix lint

* fmt python

* cargo fmt

* remove redundant test

* bump to sourmash r0.18.0

---------

Co-authored-by: C. Titus Brown <titus@idyll.org>
  • Loading branch information
bluegenes and ctb authored Dec 21, 2024
1 parent 992bf48 commit 6a3c49a
Show file tree
Hide file tree
Showing 9 changed files with 349 additions and 18 deletions.
26 changes: 13 additions & 13 deletions src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ def __init__(self, p):
"-m",
"--moltype",
default="DNA",
choices=["DNA", "protein", "dayhoff", "hp"],
help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
)
p.add_argument(
"-c",
Expand Down Expand Up @@ -187,8 +187,8 @@ def __init__(self, p):
"-m",
"--moltype",
default="DNA",
choices=["DNA", "protein", "dayhoff", "hp"],
help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
)
p.add_argument(
"-c",
Expand Down Expand Up @@ -264,8 +264,8 @@ def __init__(self, p):
"-m",
"--moltype",
default="DNA",
choices=["DNA", "protein", "dayhoff", "hp"],
help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
)
p.add_argument(
"-c",
Expand Down Expand Up @@ -348,8 +348,8 @@ def __init__(self, p):
"-m",
"--moltype",
default="DNA",
choices=["DNA", "protein", "dayhoff", "hp"],
help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
)
p.add_argument(
"-c",
Expand Down Expand Up @@ -450,8 +450,8 @@ def __init__(self, p):
"-m",
"--moltype",
default="DNA",
choices=["DNA", "protein", "dayhoff", "hp"],
help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
)
p.add_argument(
"-c",
Expand Down Expand Up @@ -544,8 +544,8 @@ def __init__(self, p):
"-m",
"--moltype",
default="DNA",
choices=["DNA", "protein", "dayhoff", "hp"],
help="molecule type (DNA, protein, dayhoff, or hp; default DNA)",
choices=["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"],
help="molecule type: DNA, protein, dayhoff, hp, or skipmer (skipm1n3 or skipm2n3); default DNA",
)
p.add_argument(
"-c",
Expand Down Expand Up @@ -640,7 +640,7 @@ def main(self, args):
args.param_string = ["k=31,scaled=1000,dna"]

# Check and append 'dna' if no moltype is found in a param string
moltypes = ["dna", "protein", "dayhoff", "hp"]
moltypes = ["dna", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]
updated_param_strings = []

for param in args.param_string:
Expand Down
Binary file added src/python/tests/test-data/SRR606249.skipm2n3.zip
Binary file not shown.
Binary file added src/python/tests/test-data/skipm1n3.zip
Binary file not shown.
Binary file added src/python/tests/test-data/skipm2n3.zip
Binary file not shown.
63 changes: 63 additions & 0 deletions src/python/tests/test_fastgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -1328,3 +1328,66 @@ def test_equal_matches(runtmp):
df = pandas.read_csv(runtmp.output("out.csv"))
assert len(df) == 2
assert set(df["intersect_bp"]) == {1000}


def test_simple_skipm2n3(
runtmp, capfd, indexed_query, indexed_against, toggle_internal_storage
):
# test basic execution!
query = get_test_data("SRR606249.skipm2n3.zip")
against = get_test_data("skipm2n3.zip") # contains 2,47,63

if indexed_query:
query = index_siglist(
runtmp, query, runtmp.output("query"), scaled=100000, moltype="skipm2n3"
)

if indexed_against:
against = index_siglist(
runtmp,
against,
runtmp.output("db"),
moltype="skipm2n3",
toggle_internal_storage=toggle_internal_storage,
)

g_output = runtmp.output("gather.csv")
p_output = runtmp.output("prefetch.csv")

runtmp.sourmash(
"scripts",
"fastgather",
"--moltype",
"skipm2n3",
query,
against,
"-o",
g_output,
"-s",
"100000",
)
assert os.path.exists(g_output)

captured = capfd.readouterr()
print(captured.err)

df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert {
"query_filename",
"query_name",
"query_md5",
"match_name",
"match_md5",
"gather_result_rank",
"intersect_bp",
}.issubset(keys)

# CTB note: we do not need to worry about this warning for query from a
# RocksDB, since there is only one.
if indexed_against:
print("indexed against:", indexed_against)
assert (
"WARNING: loading all sketches from a RocksDB into memory!" in captured.err
)
42 changes: 42 additions & 0 deletions src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,48 @@ def test_index_subdir(runtmp, toggle_internal_storage):
runtmp.sourmash("scripts", "check", output)


def test_index_skipm2n3(runtmp, toggle_internal_storage):
# test basic index!
sigzip = get_test_data("skipm2n3.zip")
output = runtmp.output("db.rocksdb")

runtmp.sourmash(
"scripts",
"index",
sigzip,
"-o",
output,
toggle_internal_storage,
"-m",
"skipm2n3",
)
assert os.path.exists(output)
print(runtmp.last_result.err)

assert "index is done" in runtmp.last_result.err


def test_index_skipm1n3(runtmp, toggle_internal_storage):
# test basic index!
sigzip = get_test_data("skipm1n3.zip")
output = runtmp.output("db.rocksdb")

runtmp.sourmash(
"scripts",
"index",
sigzip,
"-o",
output,
toggle_internal_storage,
"-m",
"skipm1n3",
)
assert os.path.exists(output)
print(runtmp.last_result.err)

assert "index is done" in runtmp.last_result.err


def test_index_misnamed_zipfile(runtmp, capfd):
# test with a misnamed input zipfile (a .sig.gz file renamed as zip file)
# (This is a generic test that checks to make sure misnamed zip files
Expand Down
179 changes: 179 additions & 0 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pytest
import csv
import pandas
import sourmash
from sourmash import index
Expand Down Expand Up @@ -1435,3 +1436,181 @@ def test_singlesketch_zip_output(runtmp):
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes


def test_manysketch_skipm2n3(runtmp, capfd):
fa_csv = runtmp.output("db-fa.csv")

fa1 = get_test_data("short.fa")
fa2 = get_test_data("short2.fa")
fa3 = get_test_data("short3.fa")
protfa1 = get_test_data("short-protein.fa")

make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])

output = runtmp.output("db.zip")

runtmp.sourmash(
"scripts",
"manysketch",
fa_csv,
"-o",
output,
"--param-str",
"dna,k=21,scaled=1",
"--param-str",
"skipm2n3,k=31,scaled=30",
)

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty
captured = capfd.readouterr()
print(captured.err)

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())
print(sigs)

assert len(sigs) == 3 # 3 dna, 3 skipmer. But sourmash can only read the DNA sigs!!
# check moltypes, etc!
dna_md5sums = {
"short": "1474578c5c46dd09da4c2df29cf86621",
"short2": "4efeebd26644278e36b9553e018a851a",
"short3": "f85747ac4f473c4a71c1740d009f512b",
}
for sig in sigs:
if sig.minhash.is_dna:
assert sig.minhash.ksize == 21
assert sig.minhash.scaled == 1
print("DNA: ", sig.name, sig.md5sum())
assert sig.md5sum() == dna_md5sums[sig.name]

# read the file with python and check sigs
import zipfile, gzip, json

with zipfile.ZipFile(output, "r") as zf:
# Check the manifest exists
assert "SOURMASH-MANIFEST.csv" in zf.namelist()

expected_signatures = [
{
"name": "short",
"ksize": 31,
"scaled": 30,
"moltype": "skipm2n3",
"md5sum": "0486fcae73545363da9cd5bfcf18d322",
},
{
"name": "short3",
"ksize": 31,
"scaled": 30,
"moltype": "skipm2n3",
"md5sum": "890557b39ae66d3177035296818de7c6",
},
{
"name": "short2",
"ksize": 31,
"scaled": 30,
"moltype": "skipm2n3",
"md5sum": "ec6305f5d82e51659f3914d47fcc32ee",
},
]
expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures}

# Read and parse the manifest
with zf.open("SOURMASH-MANIFEST.csv") as manifest_file:
manifest_data = manifest_file.read().decode("utf-8").splitlines()
manifest_data = [line for line in manifest_data if not line.startswith("#")]
manifest_reader = csv.DictReader(manifest_data)

for row in manifest_reader:
if row["moltype"] == "skipm2n3":
print("Manifest Row:", row)

# Validate row fields
md5sum = row["md5"]
assert (
md5sum in expected_signatures_dict
), f"Unexpected md5sum: {md5sum}"
expected = expected_signatures_dict[md5sum]
assert (
row["name"] == expected["name"]
), f"Name mismatch: {row['name']}"
assert (
int(row["ksize"]) == expected["ksize"]
), f"Ksize mismatch: {row['ksize']}"
assert (
row["moltype"] == expected["moltype"]
), f"Moltype mismatch: {row['moltype']}"

sig_path = row["internal_location"]
assert sig_path.startswith("signatures/")

# Extract and read the signature file
with zf.open(sig_path) as sig_gz:
with gzip.open(sig_gz, "rt") as sig_file:
sig_contents = json.load(sig_file)
print("Signature Contents:", sig_contents)

# Validate signature contents
sig_data = sig_contents[0]
print(sig_data)
siginfo = sig_data["signatures"][0]
assert (
siginfo["md5sum"] == md5sum
), f"MD5 mismatch: {siginfo['md5sum']}"
assert (
siginfo["ksize"] == expected["ksize"]
), f"Ksize mismatch: {siginfo['ksize']}"
assert (
siginfo["molecule"] == expected["moltype"]
), f"Moltype mismatch: {siginfo['molecule']}"


def test_singlesketch_skipm2n3(runtmp):
"""Test singlesketch with skipm2n3."""
fa1 = get_test_data("short.fa")
output = runtmp.output("short.sig")

# Run the singlesketch command
runtmp.sourmash(
"scripts", "singlesketch", fa1, "-p", "skipm2n3,k=31,scaled=100", "-o", output
)

# Check if the output exists and contains the expected data
assert os.path.exists(output)
# Load the output signature file
import json

with open(output, "r") as f:
data = json.load(f)

# Extract the signature part from the JSON
signatures = data[0]["signatures"]

# Expected signature fields
expected_signatures = [
{
"name": "short.fa",
"ksize": 31,
"moltype": "skipm2n3",
"md5sum": "387d25c8e4b4878c78872efd13621491",
}
]

# Check if the signatures match the expected
assert len(signatures) == len(
expected_signatures
), "Number of signatures does not match."

for sig, expected in zip(signatures, expected_signatures):
assert sig["ksize"] == expected["ksize"], f"Unexpected ksize: {sig['ksize']}"
assert (
sig["molecule"] == expected["moltype"]
), f"Unexpected moltype: {sig['molecule']}"
assert (
sig["md5sum"] == expected["md5sum"]
), f"Unexpected md5sum: {sig['md5sum']}"
assert (
data[0]["name"] == expected["name"]
), f"Unexpected name: {data[0]['name']}"
Loading

0 comments on commit 6a3c49a

Please sign in to comment.