Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: add skipmer capacity to sourmash python layer via ffi #3446

Merged
merged 32 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
910b76c
try skipmers
bluegenes Nov 13, 2024
117cd0d
Merge branch 'latest' into try-skipmers
bluegenes Nov 13, 2024
2bac5fd
use hash fn encoding instead; init testing
bluegenes Nov 14, 2024
fdd1a44
fix test
bluegenes Nov 14, 2024
55435d1
Merge branch 'latest' into try-skipmers
bluegenes Nov 14, 2024
ff33f7b
cont
bluegenes Nov 19, 2024
11978ed
test
bluegenes Nov 19, 2024
2de520e
skipmers must be at least ksize 3
bluegenes Nov 19, 2024
cb2e7e1
Merge branch 'latest' into try-skipmers
bluegenes Nov 20, 2024
d4a6200
clippy fix
bluegenes Nov 20, 2024
d7f59cf
Merge branch 'latest' into try-skipmers
bluegenes Nov 21, 2024
96aea47
WIP: skipmer improvements (#3415)
bluegenes Dec 12, 2024
25a16ae
Merge branch 'latest' into try-skipmers
bluegenes Dec 12, 2024
6359e16
try fix branchwater incompat
bluegenes Dec 12, 2024
43e069c
roll back web-sys too
bluegenes Dec 12, 2024
6008646
add js-sys, web-sys to dependabot ignore
bluegenes Dec 12, 2024
b20479c
add more tests
bluegenes Dec 16, 2024
e5ea8af
more tests
bluegenes Dec 16, 2024
8a0600d
test protein rf display
bluegenes Dec 16, 2024
8cb0bea
Merge branch 'latest' into try-skipmers
bluegenes Dec 16, 2024
e2d6c90
init py skipmers
bluegenes Dec 19, 2024
f374b81
init ffi py skipmers
bluegenes Dec 19, 2024
81bab97
Merge branch 'latest' into try-skipmers
bluegenes Dec 20, 2024
1f7fabd
Merge branch 'try-skipmers' into py-skipmers
bluegenes Dec 20, 2024
d66d4af
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
246a88f
upd bindgen
ctb Dec 20, 2024
b63d666
Merge branch 'latest' into py-skipmers
bluegenes Dec 20, 2024
0c7ed1b
more moltype
bluegenes Dec 20, 2024
379633a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
b6601f6
fix index for scaled
ctb Dec 20, 2024
b08d7e4
Merge branch 'latest' of github.com:sourmash-bio/sourmash into py-ski…
ctb Dec 20, 2024
017280a
rustfmt
bluegenes Dec 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ enum HashFunctions {
HASH_FUNCTIONS_MURMUR64_PROTEIN = 2,
HASH_FUNCTIONS_MURMUR64_DAYHOFF = 3,
HASH_FUNCTIONS_MURMUR64_HP = 4,
HASH_FUNCTIONS_MURMUR64_SKIPM1N3 = 5,
HASH_FUNCTIONS_MURMUR64_SKIPM2N3 = 6,
};
typedef uint32_t HashFunctions;

Expand Down Expand Up @@ -274,6 +276,10 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr,
bool ignore_abundance,
bool downsample);

bool kmerminhash_skipm1n3(const SourmashKmerMinHash *ptr);

bool kmerminhash_skipm2n3(const SourmashKmerMinHash *ptr);

void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize);

bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr);
Expand Down
12 changes: 12 additions & 0 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,18 @@
mh.hp()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_skipm1n3(ptr: *const SourmashKmerMinHash) -> bool {

Check warning on line 335 in src/core/src/ffi/minhash.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/ffi/minhash.rs#L335

Added line #L335 was not covered by tests
let mh = SourmashKmerMinHash::as_rust(ptr);
mh.skipm1n3()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_skipm2n3(ptr: *const SourmashKmerMinHash) -> bool {

Check warning on line 341 in src/core/src/ffi/minhash.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/ffi/minhash.rs#L341

Added line #L341 was not covered by tests
let mh = SourmashKmerMinHash::as_rust(ptr);
mh.skipm2n3()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_seed(ptr: *const SourmashKmerMinHash) -> u64 {
let mh = SourmashKmerMinHash::as_rust(ptr);
Expand Down
12 changes: 10 additions & 2 deletions src/core/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,40 @@
Murmur64Protein = 2,
Murmur64Dayhoff = 3,
Murmur64Hp = 4,
Murmur64Skipm1n3 = 5,
Murmur64Skipm2n3 = 6,
}

impl From<HashFunctions> for crate::encodings::HashFunctions {
fn from(v: HashFunctions) -> crate::encodings::HashFunctions {
use crate::encodings::HashFunctions::{
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein,
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3,
Murmur64Skipm2n3,
};
match v {
HashFunctions::Murmur64Dna => Murmur64Dna,
HashFunctions::Murmur64Protein => Murmur64Protein,
HashFunctions::Murmur64Dayhoff => Murmur64Dayhoff,
HashFunctions::Murmur64Hp => Murmur64Hp,
HashFunctions::Murmur64Skipm1n3 => Murmur64Skipm1n3,
HashFunctions::Murmur64Skipm2n3 => Murmur64Skipm2n3,

Check warning on line 55 in src/core/src/ffi/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/ffi/mod.rs#L54-L55

Added lines #L54 - L55 were not covered by tests
}
}
}

impl From<crate::encodings::HashFunctions> for HashFunctions {
fn from(v: crate::encodings::HashFunctions) -> HashFunctions {
use crate::encodings::HashFunctions::{
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein,
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3,
Murmur64Skipm2n3,
};
match v {
Murmur64Dna => HashFunctions::Murmur64Dna,
Murmur64Protein => HashFunctions::Murmur64Protein,
Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff,
Murmur64Hp => HashFunctions::Murmur64Hp,
Murmur64Skipm1n3 => HashFunctions::Murmur64Skipm1n3,
Murmur64Skipm2n3 => HashFunctions::Murmur64Skipm2n3,
_ => todo!("Not supported, probably custom"),
}
}
Expand Down
8 changes: 8 additions & 0 deletions src/core/src/sketch/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,14 @@
self.hash_function == HashFunctions::Murmur64Hp
}

pub fn skipm1n3(&self) -> bool {

Check warning on line 726 in src/core/src/sketch/minhash.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/sketch/minhash.rs#L726

Added line #L726 was not covered by tests
self.hash_function == HashFunctions::Murmur64Skipm1n3
}

pub fn skipm2n3(&self) -> bool {

Check warning on line 730 in src/core/src/sketch/minhash.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/sketch/minhash.rs#L730

Added line #L730 was not covered by tests
self.hash_function == HashFunctions::Murmur64Skipm2n3
}

pub fn mins(&self) -> Vec<u64> {
self.mins.clone()
}
Expand Down
33 changes: 33 additions & 0 deletions src/sourmash/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,39 @@ def add_moltype_args(parser):
)
parser.set_defaults(hp=False)

parser.add_argument(
"--skipm1n3",
"--skipmer-m1n3",
dest="skipm1n3",
action="store_true",
help="choose skipmer (m1n3) signatures",
)

parser.add_argument(
"--no-skipm1n3",
"--no-skipmer-m1n3",
dest="skipm1n3",
action="store_false",
help="do not choose skipmer (m1n3) signatures",
)
parser.set_defaults(skipm1n3=False)

parser.add_argument(
"--skipm2n3",
"--skipmer-m2n3",
dest="skipm2n3",
action="store_true",
help="choose skipmer (m2n3) signatures",
)
parser.add_argument(
"--no-skipm2n3",
"--no-skipmer-m2n3",
dest="skipm2n3",
action="store_false",
help="do not choose skipmer (m2n3) signatures",
)
parser.set_defaults(skipm2n3=False)

parser.add_argument(
"--dna",
"--rna",
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,7 +1242,7 @@ def _check_select_parameters(**kw):

moltype = kw.get("moltype")
if moltype is not None:
if moltype not in ["DNA", "protein", "dayhoff", "hp"]:
if moltype not in ["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]:
raise ValueError(f"unknown moltype: {moltype}")

scaled = kw.get("scaled")
Expand Down
54 changes: 50 additions & 4 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@
is_protein=False,
dayhoff=False,
hp=False,
skipm1n3=False,
skipm2n3=False,
track_abundance=False,
seed=MINHASH_DEFAULT_SEED,
max_hash=0,
Expand All @@ -215,6 +217,8 @@
* is_protein (default False) - aa k-mers
* dayhoff (default False) - dayhoff encoding
* hp (default False) - hydrophilic/hydrophobic aa
* skipm1n3 (default False) - skipmer (m1n3)
* skipm2n3 (default False) - skipmer (m2n3)
* track_abundance (default False) - track hash multiplicity
* mins (default None) - list of hashvals, or (hashval, abund) pairs
* seed (default 42) - murmurhash seed
Expand Down Expand Up @@ -243,6 +247,10 @@
elif is_protein:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
ksize = ksize * 3
elif skipm1n3:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3

Check warning on line 251 in src/sourmash/minhash.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/minhash.py#L251

Added line #L251 was not covered by tests
elif skipm2n3:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3

Check warning on line 253 in src/sourmash/minhash.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/minhash.py#L253

Added line #L253 was not covered by tests
else:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA

Expand All @@ -264,6 +272,8 @@
is_protein=self.is_protein,
dayhoff=self.dayhoff,
hp=self.hp,
skipm1n3=self.skipm1n3,
skipm2n3=self.skipm2n3,
track_abundance=self.track_abundance,
seed=self.seed,
max_hash=self._max_hash,
Expand All @@ -279,12 +289,20 @@
# note: we multiple ksize by 3 here so that
# pickle protocols that bypass __setstate__ <coff numpy coff>
# get a ksize that makes sense to the Rust layer. See #2262.
# CTB/NTP note: if you add things below, you might want to put
# them at the end, because we use internal indexes in a few places.
# see especially `_set_num_scaled()` in sig/__main__.my.
# My apologies.
return (
self.num,
self.ksize if self.is_dna else self.ksize * 3,
self.ksize
if self.is_dna or self.skipm1n3 or self.skipm2n3
else self.ksize * 3,
self.is_protein,
self.dayhoff,
self.hp,
self.skipm1n3,
self.skipm2n3,
self.hashes,
None,
self.track_abundance,
Expand All @@ -300,6 +318,8 @@
is_protein,
dayhoff,
hp,
skipm1n3,
skipm2n3,
mins,
_,
track_abundance,
Expand All @@ -316,6 +336,10 @@
if hp
else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
if is_protein
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
if skipm1n3
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
if skipm2n3
else lib.HASH_FUNCTIONS_MURMUR64_DNA
)

Expand All @@ -340,6 +364,8 @@
is_protein=self.is_protein,
dayhoff=self.dayhoff,
hp=self.hp,
skipm1n3=self.skipm1n3,
skipm2n3=self.skipm2n3,
track_abundance=self.track_abundance,
seed=self.seed,
max_hash=self._max_hash,
Expand Down Expand Up @@ -461,7 +487,7 @@

def add_kmer(self, kmer):
"Add a kmer into the sketch."
if self.is_dna:
if self.is_dna or self.skipm1n3 or self.skipm2n3:
if len(kmer) != self.ksize:
raise ValueError(f"kmer to add is not {self.ksize} in length")
else:
Expand Down Expand Up @@ -561,7 +587,9 @@

@property
def is_dna(self):
return not (self.is_protein or self.dayhoff or self.hp)
return not (
self.is_protein or self.dayhoff or self.hp or self.skipm1n3 or self.skipm2n3
)

@property
def is_protein(self):
Expand All @@ -575,10 +603,18 @@
def hp(self):
return self._methodcall(lib.kmerminhash_hp)

@property
def skipm1n3(self):
return self._methodcall(lib.kmerminhash_skipm1n3)

@property
def skipm2n3(self):
return self._methodcall(lib.kmerminhash_skipm2n3)

@property
def ksize(self):
k = self._methodcall(lib.kmerminhash_ksize)
if not self.is_dna:
if not self.is_dna and not self.skipm1n3 and not self.skipm2n3:
assert k % 3 == 0
k = int(k / 3)
return k
Expand Down Expand Up @@ -1051,6 +1087,10 @@
return "dayhoff"
elif self.hp:
return "hp"
elif self.skipm1n3:
return "skipm1n3"

Check warning on line 1091 in src/sourmash/minhash.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/minhash.py#L1091

Added line #L1091 was not covered by tests
elif self.skipm2n3:
return "skipm2n3"

Check warning on line 1093 in src/sourmash/minhash.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/minhash.py#L1093

Added line #L1093 was not covered by tests
else:
return "DNA"

Expand Down Expand Up @@ -1224,6 +1264,8 @@
is_protein,
dayhoff,
hp,
skipm1n3,
skipm2n3,
mins,
_,
track_abundance,
Expand All @@ -1240,6 +1282,10 @@
if hp
else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
if is_protein
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
if skipm1n3
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
if skipm2n3
else lib.HASH_FUNCTIONS_MURMUR64_DNA
)

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _set_num_scaled(mh, num, scaled):
# Number of hashes is 0th parameter
mh_params[0] = num
# Scale is 8th parameter
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Scale is 8th parameter
# Scale is 10th parameter

😅

mh_params[8] = _get_max_hash_for_scaled(scaled)
mh_params[10] = _get_max_hash_for_scaled(scaled)
mh.__setstate__(mh_params)
assert mh.num == num
assert mh.scaled == scaled
Expand Down
12 changes: 10 additions & 2 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@

def get_moltype(sig, require=False):
mh = sig.minhash
if mh.moltype in ("DNA", "dayhoff", "hp", "protein"):
if mh.moltype in ("DNA", "dayhoff", "hp", "protein", "skipm1n3", "skipm2n3"):
moltype = mh.moltype
else:
raise ValueError(f"unknown molecule type for sig {sig}")
Expand All @@ -109,9 +109,15 @@
moltype = "protein"
n += 1

if args.skipm1n3:
moltype = "skipm1n3"

Check warning on line 113 in src/sourmash/sourmash_args.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/sourmash_args.py#L113

Added line #L113 was not covered by tests

if args.skipm2n3:
moltype = "skipm2n3"

Check warning on line 116 in src/sourmash/sourmash_args.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/sourmash_args.py#L116

Added line #L116 was not covered by tests

if n > 1:
error(
"cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff"
"cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff/--skipm1n3/--skipm2n3"
)
sys.exit(-1)

Expand Down Expand Up @@ -185,11 +191,13 @@

def search_pattern(vals):
return any(pattern.search(val) for val in vals)

elif args.exclude_db_pattern:
pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE)

def search_pattern(vals):
return all(not pattern.search(val) for val in vals)

else:
search_pattern = None

Expand Down
Loading