From 910b76c126fb223386879071cbb51466086c14fc Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 12 Nov 2024 16:13:27 -0800 Subject: [PATCH 01/22] try skipmers --- src/core/src/ffi/minhash.rs | 4 ++-- src/core/src/signature.rs | 48 +++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 7d7c050fb0..444609166d 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -74,14 +74,14 @@ Result<*const u64> { let mut output: Vec = Vec::with_capacity(insize); if force && bad_kmers_as_zeroes{ - for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){ + for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){ match hash_value{ Ok(x) => output.push(x), Err(err) => return Err(err), } } }else{ - for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){ + for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){ match hash_value{ Ok(0) => continue, Ok(x) => output.push(x), diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 0ab8190f98..191f1d89f9 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -43,6 +43,7 @@ pub trait SigsTrait { false, self.hash_function(), self.seed(), + false, ); for hash_value in ready_hashes { @@ -65,6 +66,7 @@ pub trait SigsTrait { true, self.hash_function(), self.seed(), + false, ); for hash_value in ready_hashes { @@ -174,6 +176,7 @@ pub struct SeqToHashes { hash_function: HashFunctions, seed: u64, hashes_buffer: Vec, + is_skipmer: bool, dna_configured: bool, dna_rc: Vec, @@ -194,6 +197,7 @@ impl SeqToHashes { is_protein: bool, hash_function: HashFunctions, seed: u64, + is_skipmer: bool, ) -> SeqToHashes { let mut ksize: usize = k_size; @@ -220,6 +224,7 @@ impl SeqToHashes { hash_function, seed, hashes_buffer: Vec::with_capacity(1000), + is_skipmer, dna_configured: false, dna_rc: Vec::with_capacity(1000), dna_ksize: 0, @@ -266,8 +271,24 @@ impl Iterator for SeqToHashes { // Processing DNA if self.hash_function.dna() { - let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; + // Generate skipmer: skip every third base + let kmer: Vec = if self.is_skipmer { + // Adjust length to capture enough bases even with skipping + let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate + self.sequence[self.kmer_index..self.kmer_index + extended_length] + .iter() + .enumerate() + .filter(|&(i, _)| i % 3 != 2) // skip every third letter + .take(self.dna_ksize) // limit to the desired final length + .map(|(_, &base)| base) + .collect() + } else { + // Regular k-mer + self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize].to_vec() + }; + // let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; + // validate k-mer for j in std::cmp::max(self.kmer_index, self.dna_last_position_check) ..self.kmer_index + self.dna_ksize { @@ -299,9 +320,28 @@ impl Iterator for SeqToHashes { // (leaving this table here because I had to draw to // get the indices correctly) - let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index - ..self.dna_len - self.kmer_index]; - let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + // let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index + // ..self.dna_len - self.kmer_index]; + + let krc: Vec = if self.is_skipmer { + // Generate skipmer for reverse complement + let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate + self.dna_rc[self.dna_len - extended_length - self.kmer_index + ..self.dna_len - self.kmer_index] + .iter() + .enumerate() + .filter(|&(i, _)| i % 3 != 2) // skip every third letter + .take(self.dna_ksize) // limit to the desired final length + .map(|(_, &base)| base) + .collect() + } else { + // reg revcomp + self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index + ..self.dna_len - self.kmer_index] + .to_vec() + }; + + let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed); self.kmer_index += 1; Some(Ok(hash)) } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 { From 2bac5fd6a191297c203f09adedc3033821f93a97 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 13 Nov 2024 16:57:01 -0800 Subject: [PATCH 02/22] use hash fn encoding instead; init testing --- src/core/src/encodings.rs | 6 ++ src/core/src/ffi/minhash.rs | 4 +- src/core/src/signature.rs | 128 ++++++++++++++++++++++++------------ 3 files changed, 93 insertions(+), 45 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 8375699cb0..f273e14770 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -31,6 +31,7 @@ pub enum HashFunctions { Murmur64Protein, Murmur64Dayhoff, Murmur64Hp, + Murmur64Skip, Custom(String), } @@ -50,6 +51,9 @@ impl HashFunctions { pub fn hp(&self) -> bool { *self == HashFunctions::Murmur64Hp } + pub fn skipmer(&self) -> bool { + *self == HashFunctions::Murmur64Skip + } } impl std::fmt::Display for HashFunctions { @@ -62,6 +66,7 @@ impl std::fmt::Display for HashFunctions { HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", + HashFunctions::Murmur64Skip => "skipmer", HashFunctions::Custom(v) => v, } ) @@ -77,6 +82,7 @@ impl TryFrom<&str> for HashFunctions { "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff), "hp" => Ok(HashFunctions::Murmur64Hp), "protein" => Ok(HashFunctions::Murmur64Protein), + "skipmer" => Ok(HashFunctions::Murmur64Skip), v => unimplemented!("{v}"), } } diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index ac324f5c10..55879dd060 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -74,14 +74,14 @@ Result<*const u64> { let mut output: Vec = Vec::with_capacity(insize); if force && bad_kmers_as_zeroes{ - for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){ + for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){ match hash_value{ Ok(x) => output.push(x), Err(err) => return Err(err), } } }else{ - for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){ + for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){ match hash_value{ Ok(0) => continue, Ok(x) => output.push(x), diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index b5fbc28bc7..ca47bc49ad 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -43,7 +43,6 @@ pub trait SigsTrait { false, self.hash_function(), self.seed(), - false, ); for hash_value in ready_hashes { @@ -66,7 +65,6 @@ pub trait SigsTrait { true, self.hash_function(), self.seed(), - false, ); for hash_value in ready_hashes { @@ -176,7 +174,6 @@ pub struct SeqToHashes { hash_function: HashFunctions, seed: u64, hashes_buffer: Vec, - is_skipmer: bool, dna_configured: bool, dna_rc: Vec, @@ -197,12 +194,11 @@ impl SeqToHashes { is_protein: bool, hash_function: HashFunctions, seed: u64, - is_skipmer: bool, ) -> SeqToHashes { let mut ksize: usize = k_size; // Divide the kmer size by 3 if protein - if is_protein || !hash_function.dna() { + if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { ksize = k_size / 3; } @@ -224,7 +220,6 @@ impl SeqToHashes { hash_function, seed, hashes_buffer: Vec::with_capacity(1000), - is_skipmer, dna_configured: false, dna_rc: Vec::with_capacity(1000), dna_ksize: 0, @@ -260,7 +255,11 @@ impl Iterator for SeqToHashes { self.dna_ksize = self.k_size; self.dna_len = self.sequence.len(); if self.dna_len < self.dna_ksize - || (!self.hash_function.dna() && self.dna_len < self.k_size * 3) + || (self.hash_function.protein() && self.dna_len < self.k_size * 3) + || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) + || (self.hash_function.hp() && self.dna_len < self.k_size * 3) + || (self.hash_function.skipmer() + && self.dna_len < (self.k_size + self.k_size / 2)) { return None; } @@ -271,24 +270,9 @@ impl Iterator for SeqToHashes { // Processing DNA if self.hash_function.dna() { - // Generate skipmer: skip every third base - let kmer: Vec = if self.is_skipmer { - // Adjust length to capture enough bases even with skipping - let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate - self.sequence[self.kmer_index..self.kmer_index + extended_length] - .iter() - .enumerate() - .filter(|&(i, _)| i % 3 != 2) // skip every third letter - .take(self.dna_ksize) // limit to the desired final length - .map(|(_, &base)| base) - .collect() - } else { - // Regular k-mer - self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize].to_vec() - }; - // let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; + let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; - // validate k-mer + // validate the bases for j in std::cmp::max(self.kmer_index, self.dna_last_position_check) ..self.kmer_index + self.dna_ksize { @@ -320,26 +304,44 @@ impl Iterator for SeqToHashes { // (leaving this table here because I had to draw to // get the indices correctly) - // let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index - // ..self.dna_len - self.kmer_index]; + let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index + ..self.dna_len - self.kmer_index]; + let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + self.kmer_index += 1; + Some(Ok(hash)) + } else if self.hash_function.skipmer() { + let extended_length = self.dna_ksize + self.dna_ksize / 2; + // Build skipmer + let kmer: Vec = self.sequence + [self.kmer_index..self.kmer_index + extended_length] + .iter() + .enumerate() + .filter(|&(i, _)| i % 3 != 2) + .take(self.dna_ksize) + .map(|(_, &base)| base) + .collect(); + + // check the bases are DNA + if kmer.iter().any(|&base| !VALID[base as usize]) { + if !self.force { + return Some(Err(Error::InvalidDNA { + message: String::from_utf8(kmer).unwrap(), + })); + } else { + self.kmer_index += 1; // Move to the next position if forced + return Some(Ok(0)); + } + } - let krc: Vec = if self.is_skipmer { - // Generate skipmer for reverse complement - let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate - self.dna_rc[self.dna_len - extended_length - self.kmer_index - ..self.dna_len - self.kmer_index] - .iter() - .enumerate() - .filter(|&(i, _)| i % 3 != 2) // skip every third letter - .take(self.dna_ksize) // limit to the desired final length - .map(|(_, &base)| base) - .collect() - } else { - // reg revcomp - self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index - ..self.dna_len - self.kmer_index] - .to_vec() - }; + // Generate reverse complement skipmer + let krc: Vec = self.dna_rc[self.dna_len - extended_length - self.kmer_index + ..self.dna_len - self.kmer_index] + .iter() + .enumerate() + .filter(|&(i, _)| i % 3 != 2) + .take(self.dna_ksize) + .map(|(_, &base)| base) + .collect(); let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed); self.kmer_index += 1; @@ -958,6 +960,8 @@ mod test { use needletail::parse_fastx_reader; use crate::cmd::ComputeParameters; + use crate::encodings::HashFunctions; + use crate::signature::SeqToHashes; use crate::signature::SigsTrait; use super::Signature; @@ -1316,4 +1320,42 @@ mod test { assert_eq!(modified_sig.size(), 0); } } + + #[test] + fn test_seqtohashes_skipmer() { + let sequence = b"AGTCGTCAGTCG"; + let k_size = 7; + let seed = 42; + let force = true; // Force skip over invalid bases if needed + + // Initialize SeqToHashes iterator using the new constructor + let mut seq_to_hashes = SeqToHashes::new( + sequence, + k_size, + force, + false, + HashFunctions::Murmur64Skip, + seed, + ); + + // Define expected hashes for the skipmer configuration. + let expected_kmers = ["AGCGTAG", "GTGTCGT", "TCTCATC", "CGCAGCG"]; + let expected_krc = ["CTACGCT", "ACGACAC", "GATGAGA", "CGCTGCG"]; + + // Compute expected hashes by hashing each k-mer with its reverse complement + let expected_hashes: Vec = expected_kmers + .iter() + .zip(expected_krc.iter()) + .map(|(kmer, krc)| { + // Convert both kmer and krc to byte slices and pass to _hash_murmur + crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) + }) + .collect(); + + // Compare each produced hash from the iterator with the expected hash + for expected_hash in expected_hashes { + let hash = seq_to_hashes.next().unwrap().ok().unwrap(); + assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); + } + } } From fdd1a44fda93db7f29f9862a2584ca60f870d7fa Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 14 Nov 2024 08:48:59 -0800 Subject: [PATCH 03/22] fix test --- src/core/src/signature.rs | 48 +++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index ca47bc49ad..d463639d16 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -1322,12 +1322,51 @@ mod test { } #[test] - fn test_seqtohashes_skipmer() { - let sequence = b"AGTCGTCAGTCG"; + fn test_seqtohashes_dna() { + let sequence = b"AGTCGTCA"; let k_size = 7; let seed = 42; let force = true; // Force skip over invalid bases if needed + // Initialize SeqToHashes iterator using the new constructor + let mut seq_to_hashes = SeqToHashes::new( + sequence, + k_size, + force, + false, + HashFunctions::Murmur64Dna, + seed, + ); + + // Define expected hashes for the kmer configuration. + let expected_kmers = ["AGTCGTC", "GTCGTCA"]; + let expected_krc = ["GACGACT", "TGACGAC"]; + + // Compute expected hashes by hashing each k-mer with its reverse complement + let expected_hashes: Vec = expected_kmers + .iter() + .zip(expected_krc.iter()) + .map(|(kmer, krc)| { + // Convert both kmer and krc to byte slices and pass to _hash_murmur + crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) + }) + .collect(); + + // Compare each produced hash from the iterator with the expected hash + for expected_hash in expected_hashes { + let hash = seq_to_hashes.next().unwrap().ok().unwrap(); + assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); + } + } + + #[test] + fn test_seqtohashes_skipmer() { + let sequence = b"AGTCGTCA"; + // let rc_seq = b"TGACGACT"; + let k_size = 5; + let seed = 42; + let force = true; // Force skip over invalid bases if needed + // Initialize SeqToHashes iterator using the new constructor let mut seq_to_hashes = SeqToHashes::new( sequence, @@ -1339,8 +1378,9 @@ mod test { ); // Define expected hashes for the skipmer configuration. - let expected_kmers = ["AGCGTAG", "GTGTCGT", "TCTCATC", "CGCAGCG"]; - let expected_krc = ["CTACGCT", "ACGACAC", "GATGAGA", "CGCTGCG"]; + let expected_kmers = ["AGCGC", "GTGTA"]; + // rc of the k-mer, not of the sequence, then skipmerized. Correct? + let expected_krc = ["GCGCT", "TACAC"]; // Compute expected hashes by hashing each k-mer with its reverse complement let expected_hashes: Vec = expected_kmers From ff33f7b57bc9d11af7aaa71d2bf8f449be983f16 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 19 Nov 2024 13:19:44 -0800 Subject: [PATCH 04/22] cont --- src/core/src/encodings.rs | 8 +++---- src/core/src/signature.rs | 39 +++++++++++++++++++++------------- src/core/src/sketch/minhash.rs | 1 + 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index f273e14770..0aba272a5c 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -31,7 +31,7 @@ pub enum HashFunctions { Murmur64Protein, Murmur64Dayhoff, Murmur64Hp, - Murmur64Skip, + Murmur64Skipmer, Custom(String), } @@ -52,7 +52,7 @@ impl HashFunctions { *self == HashFunctions::Murmur64Hp } pub fn skipmer(&self) -> bool { - *self == HashFunctions::Murmur64Skip + *self == HashFunctions::Murmur64Skipmer } } @@ -66,7 +66,7 @@ impl std::fmt::Display for HashFunctions { HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", - HashFunctions::Murmur64Skip => "skipmer", + HashFunctions::Murmur64Skipmer => "skipmer", HashFunctions::Custom(v) => v, } ) @@ -82,7 +82,7 @@ impl TryFrom<&str> for HashFunctions { "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff), "hp" => Ok(HashFunctions::Murmur64Hp), "protein" => Ok(HashFunctions::Murmur64Protein), - "skipmer" => Ok(HashFunctions::Murmur64Skip), + "skipmer" => Ok(HashFunctions::Murmur64Skipmer), v => unimplemented!("{v}"), } } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index d463639d16..53d186fe54 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -230,6 +230,19 @@ impl SeqToHashes { translate_iter_step: 0, } } + + fn validate_base(&self, base: u8, kmer: &Vec) -> Option> { + if !VALID[base as usize] { + if !self.force { + return Some(Err(Error::InvalidDNA { + message: String::from_utf8(kmer.clone()).unwrap_or_default(), + })); + } else { + return Some(Ok(0)); // Skip this position if forced + } + } + None // Base is valid, so return None to continue + } } /* @@ -311,26 +324,22 @@ impl Iterator for SeqToHashes { Some(Ok(hash)) } else if self.hash_function.skipmer() { let extended_length = self.dna_ksize + self.dna_ksize / 2; - // Build skipmer - let kmer: Vec = self.sequence + + // Build skipmer with DNA base validation + let mut kmer: Vec = Vec::with_capacity(self.dna_ksize); + for (_i, &base) in self.sequence [self.kmer_index..self.kmer_index + extended_length] .iter() .enumerate() .filter(|&(i, _)| i % 3 != 2) .take(self.dna_ksize) - .map(|(_, &base)| base) - .collect(); - - // check the bases are DNA - if kmer.iter().any(|&base| !VALID[base as usize]) { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer).unwrap(), - })); - } else { - self.kmer_index += 1; // Move to the next position if forced - return Some(Ok(0)); + { + // Use the validate_base method to check the base + if let Some(result) = self.validate_base(base, &kmer) { + self.kmer_index += 1; // Move to the next position if skipping is forced + return Some(result); } + kmer.push(base); } // Generate reverse complement skipmer @@ -1373,7 +1382,7 @@ mod test { k_size, force, false, - HashFunctions::Murmur64Skip, + HashFunctions::Murmur64Skipmer, seed, ); diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 438294e098..3c8f3b4240 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -153,6 +153,7 @@ impl<'de> Deserialize<'de> for KmerMinHash { "dayhoff" => HashFunctions::Murmur64Dayhoff, "hp" => HashFunctions::Murmur64Hp, "dna" => HashFunctions::Murmur64Dna, + "skipmer" => HashFunctions::Murmur64Skipmer, _ => unimplemented!(), // TODO: throw error here }; From 11978ed84cb9078e36128f3d55f5bb34eaad4d3e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 19 Nov 2024 15:01:21 -0800 Subject: [PATCH 05/22] test --- src/core/src/cmd.rs | 21 +++++++++++++++++++++ src/core/src/signature.rs | 30 +++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs index b59e99b912..115d2672af 100644 --- a/src/core/src/cmd.rs +++ b/src/core/src/cmd.rs @@ -42,6 +42,10 @@ pub struct ComputeParameters { #[builder(default = false)] hp: bool, + #[getset(get_copy = "pub", set = "pub")] + #[builder(default = false)] + skipmer: bool, + #[getset(get_copy = "pub", set = "pub")] #[builder(default = false)] singleton: bool, @@ -165,6 +169,23 @@ pub fn build_template(params: &ComputeParameters) -> Vec { )); } + if params.skipmer { + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() + .num(params.num_hashes) + .ksize(*k) + .hash_function(HashFunctions::Murmur64Skipmer) + .max_hash(max_hash) + .seed(params.seed) + .abunds(if params.track_abundance { + Some(Default::default()) + } else { + None + }) + .build(), + )); + } + if params.dna { ksigs.push(Sketch::LargeMinHash( KmerMinHashBTree::builder() diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 53d186fe54..8013e506ea 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -272,7 +272,8 @@ impl Iterator for SeqToHashes { || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) || (self.hash_function.hp() && self.dna_len < self.k_size * 3) || (self.hash_function.skipmer() - && self.dna_len < (self.k_size + self.k_size / 2)) + // add 1 to round up rather than down + && self.dna_len < (self.k_size + ((self.k_size + 1) / 2) - 1)) { return None; } @@ -323,7 +324,12 @@ impl Iterator for SeqToHashes { self.kmer_index += 1; Some(Ok(hash)) } else if self.hash_function.skipmer() { - let extended_length = self.dna_ksize + self.dna_ksize / 2; + let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down + + // Check bounds to ensure we don't exceed the sequence length + if self.kmer_index + extended_length > self.sequence.len() { + return None; + } // Build skipmer with DNA base validation let mut kmer: Vec = Vec::with_capacity(self.dna_ksize); @@ -1091,6 +1097,24 @@ mod test { assert_eq!(sig.signatures[1].size(), 2); } + #[test] + fn signature_skipmer_add_sequence() { + let params = ComputeParameters::builder() + .ksizes(vec![3, 6]) + .num_hashes(3u32) + .dna(false) + .skipmer(true) + .build(); + + let mut sig = Signature::from_params(¶ms); + sig.add_sequence(b"ATGCATGA", false).unwrap(); + + assert_eq!(sig.signatures.len(), 2); + dbg!(&sig.signatures); + assert_eq!(sig.signatures[0].size(), 3); + assert_eq!(sig.signatures[1].size(), 1); + } + #[test] fn signature_add_sequence_cp() { let mut cp = ComputeParameters::default(); @@ -1364,7 +1388,7 @@ mod test { // Compare each produced hash from the iterator with the expected hash for expected_hash in expected_hashes { let hash = seq_to_hashes.next().unwrap().ok().unwrap(); - assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); + assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); } } From 2de520eeea6b4b75844030c512d892dcf48b4b3c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 19 Nov 2024 15:15:41 -0800 Subject: [PATCH 06/22] skipmers must be at least ksize 3 --- src/core/src/signature.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 8013e506ea..3f4d98cb51 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -324,6 +324,11 @@ impl Iterator for SeqToHashes { self.kmer_index += 1; Some(Ok(hash)) } else if self.hash_function.skipmer() { + // check that we can actually build skipmers + if self.k_size < 3 { + unimplemented!() + // return None + } let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down // Check bounds to ensure we don't exceed the sequence length @@ -1100,7 +1105,7 @@ mod test { #[test] fn signature_skipmer_add_sequence() { let params = ComputeParameters::builder() - .ksizes(vec![3, 6]) + .ksizes(vec![3, 4, 5, 6]) .num_hashes(3u32) .dna(false) .skipmer(true) @@ -1109,10 +1114,26 @@ mod test { let mut sig = Signature::from_params(¶ms); sig.add_sequence(b"ATGCATGA", false).unwrap(); - assert_eq!(sig.signatures.len(), 2); + assert_eq!(sig.signatures.len(), 4); dbg!(&sig.signatures); assert_eq!(sig.signatures[0].size(), 3); - assert_eq!(sig.signatures[1].size(), 1); + assert_eq!(sig.signatures[1].size(), 3); + assert_eq!(sig.signatures[2].size(), 2); + assert_eq!(sig.signatures[3].size(), 1); + } + + #[test] + #[should_panic(expected = "not implemented")] + fn signature_skipmer_add_sequence_too_small() { + let params = ComputeParameters::builder() + .ksizes(vec![2]) + .num_hashes(3u32) + .dna(false) + .skipmer(true) + .build(); + + let mut sig = Signature::from_params(¶ms); + sig.add_sequence(b"ATGCATGA", false).unwrap(); } #[test] From d4a620094ba90a86dcd18197732f2bb18f7266f5 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 19 Nov 2024 17:20:21 -0800 Subject: [PATCH 07/22] clippy fix --- src/core/src/signature.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 3f4d98cb51..93580170a2 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -231,11 +231,11 @@ impl SeqToHashes { } } - fn validate_base(&self, base: u8, kmer: &Vec) -> Option> { + fn validate_base(&self, base: u8, kmer: &[u8]) -> Option> { if !VALID[base as usize] { if !self.force { return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.clone()).unwrap_or_default(), + message: String::from_utf8(kmer.to_owned()).unwrap_or_default(), })); } else { return Some(Ok(0)); // Skip this position if forced From 96aea47337f98289f5eda6913c37d57b5cc2ebef Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Thu, 12 Dec 2024 13:57:47 -0800 Subject: [PATCH 08/22] WIP: skipmer improvements (#3415) Make skipmers robust, but keep #3395 functional in the meantime. This PR: - enables second skipmer types, so we have m1n3 in addition to m2n3 - switches to a reading frame approach for both translation + skipmers, which means we first build the reading frame, then kmerize, rather than building kmers + translating/skipping on the fly - avoids "extended length" needed for skipping on the fly Since this changes the `SeqToHashes` strategy a bit, there's one python test where we now see a different error. Future thoughts: - with the new structure, it would be straightforward to add validation to exclude protein k-mers with invalid amino acids (`X`). I guess I'm not entirely sure what happens to those atm... --- src/core/src/cmd.rs | 27 +- src/core/src/encodings.rs | 18 +- src/core/src/signature.rs | 928 +++++++++++++++++++++++---------- src/core/src/sketch/minhash.rs | 3 +- tests/test_sourmash_sketch.py | 3 +- 5 files changed, 707 insertions(+), 272 deletions(-) diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs index 115d2672af..d5bfb092ba 100644 --- a/src/core/src/cmd.rs +++ b/src/core/src/cmd.rs @@ -44,7 +44,11 @@ pub struct ComputeParameters { #[getset(get_copy = "pub", set = "pub")] #[builder(default = false)] - skipmer: bool, + skipm1n3: bool, + + #[getset(get_copy = "pub", set = "pub")] + #[builder(default = false)] + skipm2n3: bool, #[getset(get_copy = "pub", set = "pub")] #[builder(default = false)] @@ -169,12 +173,29 @@ pub fn build_template(params: &ComputeParameters) -> Vec { )); } - if params.skipmer { + if params.skipm1n3 { + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() + .num(params.num_hashes) + .ksize(*k) + .hash_function(HashFunctions::Murmur64Skipm1n3) + .max_hash(max_hash) + .seed(params.seed) + .abunds(if params.track_abundance { + Some(Default::default()) + } else { + None + }) + .build(), + )); + } + + if params.skipm2n3 { ksigs.push(Sketch::LargeMinHash( KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) - .hash_function(HashFunctions::Murmur64Skipmer) + .hash_function(HashFunctions::Murmur64Skipm2n3) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 0aba272a5c..56077c80e1 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -31,7 +31,8 @@ pub enum HashFunctions { Murmur64Protein, Murmur64Dayhoff, Murmur64Hp, - Murmur64Skipmer, + Murmur64Skipm1n3, + Murmur64Skipm2n3, Custom(String), } @@ -51,8 +52,13 @@ impl HashFunctions { pub fn hp(&self) -> bool { *self == HashFunctions::Murmur64Hp } - pub fn skipmer(&self) -> bool { - *self == HashFunctions::Murmur64Skipmer + + pub fn skipm1n3(&self) -> bool { + *self == HashFunctions::Murmur64Skipm1n3 + } + + pub fn skipm2n3(&self) -> bool { + *self == HashFunctions::Murmur64Skipm2n3 } } @@ -66,7 +72,8 @@ impl std::fmt::Display for HashFunctions { HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", - HashFunctions::Murmur64Skipmer => "skipmer", + HashFunctions::Murmur64Skipm1n3 => "skipm1n3", + HashFunctions::Murmur64Skipm2n3 => "skipm2n3", HashFunctions::Custom(v) => v, } ) @@ -82,7 +89,8 @@ impl TryFrom<&str> for HashFunctions { "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff), "hp" => Ok(HashFunctions::Murmur64Hp), "protein" => Ok(HashFunctions::Murmur64Protein), - "skipmer" => Ok(HashFunctions::Murmur64Skipmer), + "skipm1n3" => Ok(HashFunctions::Murmur64Skipm1n3), + "skipm2n3" => Ok(HashFunctions::Murmur64Skipm2n3), v => unimplemented!("{v}"), } } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 93580170a2..5cc60ca299 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -10,6 +10,7 @@ use std::path::Path; use std::str; use cfg_if::cfg_if; +use itertools::Itertools; #[cfg(feature = "parallel")] use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -163,27 +164,149 @@ impl SigsTrait for Sketch { } } -// Iterator for converting sequence to hashes +#[derive(Debug, Clone)] +pub enum ReadingFrame { + DNA { + fw: Vec, + rc: Vec, + len: usize, // len gives max_index for kmer iterator + }, + Protein { + fw: Vec, + len: usize, + }, +} + +impl std::fmt::Display for ReadingFrame { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadingFrame::DNA { fw, rc, len } => { + let fw_str = String::from_utf8_lossy(fw).to_string(); + let rc_str = String::from_utf8_lossy(rc).to_string(); + write!( + f, + "Type: DNA ({}bp), Forward: {}, Reverse Complement: {}", + len, fw_str, rc_str + ) + } + ReadingFrame::Protein { fw, len } => { + let fw_str = String::from_utf8_lossy(fw).to_string(); + write!(f, "Type: Protein ({}aa), Forward: {}", len, fw_str) + } + } + } +} + +impl ReadingFrame { + pub fn new_dna(sequence: &[u8]) -> Self { + let fw = sequence.to_ascii_uppercase(); + let rc = revcomp(&fw); + let len = sequence.len(); + ReadingFrame::DNA { fw, rc, len } + } + + pub fn new_protein(sequence: &[u8], dayhoff: bool, hp: bool) -> Self { + let seq = sequence.to_ascii_uppercase(); + let fw: Vec = if dayhoff { + seq.iter().map(|&aa| aa_to_dayhoff(aa)).collect() + } else if hp { + seq.iter().map(|&aa| aa_to_hp(aa)).collect() + } else { + seq + }; + + let len = fw.len(); + ReadingFrame::Protein { fw, len } + } + + pub fn new_skipmer(sequence: &[u8], start: usize, m: usize, n: usize) -> Self { + let seq = sequence.to_ascii_uppercase(); + if start >= n { + panic!("Skipmer frame number must be < n ({})", n); + } + // do we need to round up? (+1) + let mut fw = Vec::with_capacity(((seq.len() * m) + 1) / n); + seq.iter().skip(start).enumerate().for_each(|(i, &base)| { + if i % n < m { + fw.push(base.to_ascii_uppercase()); + } + }); + + let len = fw.len(); + let rc = revcomp(&fw); + ReadingFrame::DNA { fw, rc, len } + } + + // this is the only one that doesn't uppercase in here b/c more efficient to uppercase externally :/ + pub fn new_translated(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Self { + if frame_number > 2 { + panic!("Frame number must be 0, 1, or 2"); + } + + // Translate sequence into amino acids + let mut fw = Vec::with_capacity(sequence.len() / 3); + // NOTE: b/c of chunks(3), we only process full codons and ignore leftover bases (e.g. 1 or 2 at end of frame) + sequence + .iter() + .skip(frame_number) // Skip the initial bases for the frame + .take(sequence.len() - frame_number) // Adjust length based on skipped bases + .chunks(3) // Group into codons (triplets) using itertools + .into_iter() + .filter_map(|chunk| { + let codon: Vec = chunk.cloned().collect(); // Collect the chunk into a Vec + to_aa(&codon, dayhoff, hp).ok() // Translate the codon + }) + .for_each(|aa| fw.extend(aa)); // Extend `fw` with amino acids + + let len = fw.len(); + + // return protein reading frame + ReadingFrame::Protein { fw, len } + } + + /// Get the forward sequence. + #[inline] + pub fn fw(&self) -> &[u8] { + match self { + ReadingFrame::DNA { fw, .. } => fw, + ReadingFrame::Protein { fw, .. } => fw, + } + } + + /// Get the reverse complement sequence (if DNA). + #[inline] + pub fn rc(&self) -> &[u8] { + match self { + ReadingFrame::DNA { rc, .. } => rc, + _ => panic!("Reverse complement is only available for DNA frames"), + } + } + + #[inline] + pub fn length(&self) -> usize { + match self { + ReadingFrame::DNA { len, .. } => *len, + ReadingFrame::Protein { len, .. } => *len, + } + } + + /// Get the type of the frame as a string. + pub fn frame_type(&self) -> &'static str { + match self { + ReadingFrame::DNA { .. } => "DNA", + ReadingFrame::Protein { .. } => "Protein", + } + } +} + pub struct SeqToHashes { - sequence: Vec, - kmer_index: usize, k_size: usize, - max_index: usize, force: bool, - is_protein: bool, - hash_function: HashFunctions, seed: u64, - hashes_buffer: Vec, - - dna_configured: bool, - dna_rc: Vec, - dna_ksize: usize, - dna_len: usize, - dna_last_position_check: usize, - - prot_configured: bool, - aa_seq: Vec, - translate_iter_step: usize, + frames: Vec, + frame_index: usize, // Index of the current frame + kmer_index: usize, // Current k-mer index within the frame + last_position_check: usize, // Index of last base we validated } impl SeqToHashes { @@ -194,118 +317,141 @@ impl SeqToHashes { is_protein: bool, hash_function: HashFunctions, seed: u64, - ) -> SeqToHashes { + ) -> Self { let mut ksize: usize = k_size; - // Divide the kmer size by 3 if protein + // Adjust kmer size for protein-based hash functions if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { ksize = k_size / 3; } - // By setting _max_index to 0, the iterator will return None and exit - let _max_index = if seq.len() >= ksize { - seq.len() - ksize + 1 + // Generate frames based on sequence type and hash function + let frames = if hash_function.dna() { + Self::dna_frames(seq) + } else if is_protein { + Self::protein_frames(seq, &hash_function) + } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + Self::translated_frames(seq, &hash_function) + } else if hash_function.skipm1n3() || hash_function.skipm2n3() { + Self::skipmer_frames(seq, &hash_function, ksize) } else { - 0 + unimplemented!(); }; SeqToHashes { - // Here we convert the sequence to upper case - sequence: seq.to_ascii_uppercase(), k_size: ksize, - kmer_index: 0, - max_index: _max_index, force, - is_protein, - hash_function, seed, - hashes_buffer: Vec::with_capacity(1000), - dna_configured: false, - dna_rc: Vec::with_capacity(1000), - dna_ksize: 0, - dna_len: 0, - dna_last_position_check: 0, - prot_configured: false, - aa_seq: Vec::new(), - translate_iter_step: 0, + frames, + frame_index: 0, + kmer_index: 0, + last_position_check: 0, } } - fn validate_base(&self, base: u8, kmer: &[u8]) -> Option> { - if !VALID[base as usize] { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.to_owned()).unwrap_or_default(), - })); - } else { - return Some(Ok(0)); // Skip this position if forced - } + /// generate frames from DNA: 1 DNA frame (fw+rc) + fn dna_frames(seq: &[u8]) -> Vec { + vec![ReadingFrame::new_dna(seq)] + } + + /// generate frames from protein: 1 protein frame + fn protein_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + vec![ReadingFrame::new_protein( + seq, + hash_function.dayhoff(), + hash_function.hp(), + )] + } + + /// generate translated frames: 6 protein frames + fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + // since we need to revcomp BEFORE making ReadingFrames, uppercase the sequence here + let sequence = seq.to_ascii_uppercase(); + let revcomp_sequence = revcomp(&sequence); + (0..3) + .flat_map(|frame_number| { + vec![ + ReadingFrame::new_translated( + &sequence, + frame_number, + hash_function.dayhoff(), + hash_function.hp(), + ), + ReadingFrame::new_translated( + &revcomp_sequence, + frame_number, + hash_function.dayhoff(), + hash_function.hp(), + ), + ] + }) + .collect() + } + + /// generate skipmer frames: 3 DNA frames (each with fw+rc) + fn skipmer_frames( + seq: &[u8], + hash_function: &HashFunctions, + ksize: usize, + ) -> Vec { + let (m, n) = if hash_function.skipm1n3() { + (1, 3) + } else { + (2, 3) + }; + if ksize < n { + unimplemented!() } - None // Base is valid, so return None to continue + (0..3) + .flat_map(|frame_number| vec![ReadingFrame::new_skipmer(seq, frame_number, m, n)]) + .collect() } -} -/* -Iterator that return a kmer hash for all modes except translate. -In translate mode: - - all the frames are processed at once and converted to hashes. - - all the hashes are stored in `hashes_buffer` - - after processing all the kmers, `translate_iter_step` is incremented - per iteration to iterate over all the indeces of the `hashes_buffer`. - - the iterator will die once `translate_iter_step` == length(hashes_buffer) -More info https://github.com/sourmash-bio/sourmash/pull/1946 -*/ + fn out_of_bounds(&self, frame: &ReadingFrame) -> bool { + self.kmer_index + self.k_size > frame.length() + } +} impl Iterator for SeqToHashes { type Item = Result; fn next(&mut self) -> Option { - if (self.kmer_index < self.max_index) || !self.hashes_buffer.is_empty() { - // Processing DNA or Translated DNA - if !self.is_protein { - // Setting the parameters only in the first iteration - if !self.dna_configured { - self.dna_ksize = self.k_size; - self.dna_len = self.sequence.len(); - if self.dna_len < self.dna_ksize - || (self.hash_function.protein() && self.dna_len < self.k_size * 3) - || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) - || (self.hash_function.hp() && self.dna_len < self.k_size * 3) - || (self.hash_function.skipmer() - // add 1 to round up rather than down - && self.dna_len < (self.k_size + ((self.k_size + 1) / 2) - 1)) - { - return None; - } - // pre-calculate the reverse complement for the full sequence... - self.dna_rc = revcomp(&self.sequence); - self.dna_configured = true; - } + while self.frame_index < self.frames.len() { + let frame = &self.frames[self.frame_index]; + + // Do we need to move to the next frame? + if self.out_of_bounds(frame) { + self.frame_index += 1; + self.kmer_index = 0; // Reset for the next frame + self.last_position_check = 0; + continue; + } - // Processing DNA - if self.hash_function.dna() { - let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; + let result = match frame { + ReadingFrame::DNA { .. } => { + let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; + let rc = frame.rc(); - // validate the bases - for j in std::cmp::max(self.kmer_index, self.dna_last_position_check) - ..self.kmer_index + self.dna_ksize + // Validate k-mer bases + for j in std::cmp::max(self.kmer_index, self.last_position_check) + ..self.kmer_index + self.k_size { - if !VALID[self.sequence[j] as usize] { + if !VALID[frame.fw()[j] as usize] { if !self.force { + // Return an error if force is false return Some(Err(Error::InvalidDNA { message: String::from_utf8(kmer.to_vec()).unwrap(), })); } else { + // Skip the invalid k-mer self.kmer_index += 1; - // Move the iterator to the next step return Some(Ok(0)); } } - self.dna_last_position_check += 1; + self.last_position_check += 1; } - // ... and then while moving the k-mer window forward for the sequence - // we move another window backwards for the RC. + // Compute canonical hash // For a ksize = 3, and a sequence AGTCGT (len = 6): // +-+---------+---------------+-------+ // seq RC |i|i + ksize|len - ksize - i|len - i| @@ -317,146 +463,21 @@ impl Iterator for SeqToHashes { // +-+---------+---------------+-------+ // (leaving this table here because I had to draw to // get the indices correctly) - - let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index - ..self.dna_len - self.kmer_index]; + let krc = &rc[frame.length() - self.k_size - self.kmer_index + ..frame.length() - self.kmer_index]; let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); - self.kmer_index += 1; - Some(Ok(hash)) - } else if self.hash_function.skipmer() { - // check that we can actually build skipmers - if self.k_size < 3 { - unimplemented!() - // return None - } - let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down - - // Check bounds to ensure we don't exceed the sequence length - if self.kmer_index + extended_length > self.sequence.len() { - return None; - } - - // Build skipmer with DNA base validation - let mut kmer: Vec = Vec::with_capacity(self.dna_ksize); - for (_i, &base) in self.sequence - [self.kmer_index..self.kmer_index + extended_length] - .iter() - .enumerate() - .filter(|&(i, _)| i % 3 != 2) - .take(self.dna_ksize) - { - // Use the validate_base method to check the base - if let Some(result) = self.validate_base(base, &kmer) { - self.kmer_index += 1; // Move to the next position if skipping is forced - return Some(result); - } - kmer.push(base); - } - - // Generate reverse complement skipmer - let krc: Vec = self.dna_rc[self.dna_len - extended_length - self.kmer_index - ..self.dna_len - self.kmer_index] - .iter() - .enumerate() - .filter(|&(i, _)| i % 3 != 2) - .take(self.dna_ksize) - .map(|(_, &base)| base) - .collect(); - - let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed); - self.kmer_index += 1; - Some(Ok(hash)) - } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 { - // Processing protein by translating DNA - // TODO: Implement iterator over frames instead of hashes_buffer. - - for frame_number in 0..3 { - let substr: Vec = self - .sequence - .iter() - .cloned() - .skip(frame_number) - .take(self.sequence.len() - frame_number) - .collect(); - - let aa = to_aa( - &substr, - self.hash_function.dayhoff(), - self.hash_function.hp(), - ) - .unwrap(); - - aa.windows(self.k_size).for_each(|n| { - let hash = crate::_hash_murmur(n, self.seed); - self.hashes_buffer.push(hash); - }); - - let rc_substr: Vec = self - .dna_rc - .iter() - .cloned() - .skip(frame_number) - .take(self.dna_rc.len() - frame_number) - .collect(); - let aa_rc = to_aa( - &rc_substr, - self.hash_function.dayhoff(), - self.hash_function.hp(), - ) - .unwrap(); - - aa_rc.windows(self.k_size).for_each(|n| { - let hash = crate::_hash_murmur(n, self.seed); - self.hashes_buffer.push(hash); - }); - } - Some(Ok(0)) - } else { - if self.translate_iter_step == self.hashes_buffer.len() { - self.hashes_buffer.clear(); - self.kmer_index = self.max_index; - return Some(Ok(0)); - } - let curr_idx = self.translate_iter_step; - self.translate_iter_step += 1; - Some(Ok(self.hashes_buffer[curr_idx])) + Ok(hash) } - } else { - // Processing protein - // The kmer size is already divided by 3 - - if self.hash_function.protein() { - let aa_kmer = &self.sequence[self.kmer_index..self.kmer_index + self.k_size]; - let hash = crate::_hash_murmur(aa_kmer, self.seed); - self.kmer_index += 1; - Some(Ok(hash)) - } else { - if !self.prot_configured { - self.aa_seq = match &self.hash_function { - HashFunctions::Murmur64Dayhoff => { - self.sequence.iter().cloned().map(aa_to_dayhoff).collect() - } - HashFunctions::Murmur64Hp => { - self.sequence.iter().cloned().map(aa_to_hp).collect() - } - invalid => { - return Some(Err(Error::InvalidHashFunction { - function: format!("{}", invalid), - })); - } - }; - } - - let aa_kmer = &self.aa_seq[self.kmer_index..self.kmer_index + self.k_size]; - let hash = crate::_hash_murmur(aa_kmer, self.seed); - self.kmer_index += 1; - Some(Ok(hash)) + ReadingFrame::Protein { .. } => { + let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; + Ok(crate::_hash_murmur(kmer, self.seed)) } - } - } else { - // End the iterator - None + }; + + self.kmer_index += 1; // Advance k-mer index for valid k-mers + return Some(result); } + None // No more frames or k-mers } } @@ -973,6 +994,7 @@ impl TryInto for Signature { #[cfg(test)] mod test { + use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; @@ -1103,12 +1125,12 @@ mod test { } #[test] - fn signature_skipmer_add_sequence() { + fn signature_skipm2n3_add_sequence() { let params = ComputeParameters::builder() .ksizes(vec![3, 4, 5, 6]) .num_hashes(3u32) .dna(false) - .skipmer(true) + .skipm2n3(true) .build(); let mut sig = Signature::from_params(¶ms); @@ -1118,18 +1140,53 @@ mod test { dbg!(&sig.signatures); assert_eq!(sig.signatures[0].size(), 3); assert_eq!(sig.signatures[1].size(), 3); - assert_eq!(sig.signatures[2].size(), 2); + eprintln!("{:?}", sig.signatures[2]); + assert_eq!(sig.signatures[2].size(), 3); assert_eq!(sig.signatures[3].size(), 1); } + #[test] + fn signature_skipm1n3_add_sequence() { + let params = ComputeParameters::builder() + .ksizes(vec![3, 4, 5, 6]) + .num_hashes(10u32) + .dna(false) + .skipm1n3(true) + .build(); + + let mut sig = Signature::from_params(¶ms); + sig.add_sequence(b"ATGCATGAATGAC", false).unwrap(); + + assert_eq!(sig.signatures.len(), 4); + dbg!(&sig.signatures); + assert_eq!(sig.signatures[0].size(), 5); + assert_eq!(sig.signatures[1].size(), 4); + assert_eq!(sig.signatures[2].size(), 1); + assert_eq!(sig.signatures[3].size(), 0); + } + #[test] #[should_panic(expected = "not implemented")] - fn signature_skipmer_add_sequence_too_small() { + fn signature_skipm2n3_add_sequence_too_small() { let params = ComputeParameters::builder() .ksizes(vec![2]) - .num_hashes(3u32) + .num_hashes(10u32) .dna(false) - .skipmer(true) + .skipm2n3(true) + .build(); + + let mut sig = Signature::from_params(¶ms); + sig.add_sequence(b"ATGCATGA", false).unwrap(); + } + + #[test] + #[should_panic(expected = "not implemented")] + fn signature_skipm1n3_add_sequence_too_small() { + let params = ComputeParameters::builder() + .ksizes(vec![2]) + .num_hashes(10u32) + .dna(false) + .skipm1n3(true) .build(); let mut sig = Signature::from_params(¶ms); @@ -1375,19 +1432,213 @@ mod test { } } + #[test] + fn test_seqtohashes_frames_dna() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), sequence.as_slice()); + assert_eq!(frames[0].rc(), b"ACGACT".as_slice()); + } + + #[test] + fn test_seqtohashes_frames_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), sequence.as_slice()); + } + + #[test] + #[should_panic] + fn test_seqtohashes_frames_is_protein_try_access_rc() { + // test panic if trying to access rc + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + // protein frame doesn't have rc; this should panic + eprintln!("{:?}", frames[0].rc()); + } + + #[test] + fn test_seqtohashes_frames_is_protein_dayhoff() { + let sequence = b"MVLSPADKTNVKAAW"; + let dayhoff_seq = b"eeebbbcdbcedbbf"; + let hash_function = HashFunctions::Murmur64Dayhoff; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), dayhoff_seq.as_slice()); + } + + #[test] + fn test_seqtohashes_frames_is_protein_hp() { + let sequence = b"MVLSPADKTNVKAAW"; + let hp_seq = b"hhhphhpppphphhh"; + let hash_function = HashFunctions::Murmur64Hp; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), hp_seq.as_slice()); + } + + #[test] + fn test_seqtohashes_frames_translate_protein() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames[0].fw(), b"SRRA".as_slice()); + assert_eq!(frames[1].fw(), b"SSTT".as_slice()); + assert_eq!(frames[2].fw(), b"VVE".as_slice()); + assert_eq!(frames[3].fw(), b"ARR".as_slice()); + assert_eq!(frames[4].fw(), b"SSS".as_slice()); + assert_eq!(frames[5].fw(), b"LDD".as_slice()); + } + + #[test] + fn test_seqtohashes_frames_skipmer_m1n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm1n3; // Represents m=1, n=3 + let k_size = 3; // K-mer size is not directly relevant for skipmer frame validation + let seed = 42; // Seed is also irrelevant for frame structure + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + eprintln!("Frames: {:?}", frames); + + assert_eq!(frames.len(), 3); // Three skipmer frames + + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + assert_eq!(frames[0].fw(), b"ACCG".as_slice()); + assert_eq!(frames[0].rc(), b"CGGT".as_slice()); + + assert_eq!(frames[1].fw(), b"GGGC".as_slice()); + assert_eq!(frames[1].rc(), b"GCCC".as_slice()); + + assert_eq!(frames[2].fw(), b"TTAT".as_slice()); + assert_eq!(frames[2].rc(), b"ATAA".as_slice()); + } + + #[test] + fn test_seqtohashes_frames_skipmer_m2n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm2n3; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames; + eprintln!("Frames: {:?}", frames); + + assert_eq!(frames.len(), 3); // Three skipmer frames + + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + assert_eq!(frames[0].fw(), b"AGCGCGGC".as_slice()); + assert_eq!(frames[0].rc(), b"GCCGCGCT".as_slice()); + + assert_eq!(frames[1].fw(), b"GTGTGACT".as_slice()); + assert_eq!(frames[1].rc(), b"AGTCACAC".as_slice()); + + assert_eq!(frames[2].fw(), b"TCTCAGT".as_slice()); + assert_eq!(frames[2].rc(), b"ACTGAGA".as_slice()); + } + #[test] fn test_seqtohashes_dna() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + + // Expected k-mers from the forward and reverse complement sequence + let expected_kmers = vec![ + (b"AGT".to_vec(), b"ACT".to_vec()), + (b"GTC".to_vec(), b"GAC".to_vec()), + (b"TCG".to_vec(), b"CGA".to_vec()), + (b"CGT".to_vec(), b"ACG".to_vec()), + ]; + + // Compute expected hashes from expected kmers + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) + .collect(); + + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); + } + + #[test] + fn test_seqtohashes_dna_2() { let sequence = b"AGTCGTCA"; let k_size = 7; let seed = 42; let force = true; // Force skip over invalid bases if needed - + let is_protein = false; // Initialize SeqToHashes iterator using the new constructor let mut seq_to_hashes = SeqToHashes::new( sequence, k_size, force, - false, + is_protein, HashFunctions::Murmur64Dna, seed, ); @@ -1414,42 +1665,195 @@ mod test { } #[test] - fn test_seqtohashes_skipmer() { - let sequence = b"AGTCGTCA"; - // let rc_seq = b"TGACGACT"; - let k_size = 5; + fn test_seqtohashes_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; let seed = 42; - let force = true; // Force skip over invalid bases if needed + let force = false; + let is_protein = true; + + let sth = SeqToHashes::new(sequence, k_size * 3, force, is_protein, hash_function, seed); + + // Expected k-mers for protein sequence + let expected_kmers = vec![ + b"MVL".to_vec(), + b"VLS".to_vec(), + b"LSP".to_vec(), + b"SPA".to_vec(), + b"PAD".to_vec(), + b"ADK".to_vec(), + b"DKT".to_vec(), + b"KTN".to_vec(), + b"TNV".to_vec(), + b"NVK".to_vec(), + b"VKA".to_vec(), + b"KAA".to_vec(), + b"AAW".to_vec(), + ]; + + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_kmers + .iter() + .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + .collect(); - // Initialize SeqToHashes iterator using the new constructor - let mut seq_to_hashes = SeqToHashes::new( - sequence, - k_size, - force, - false, - HashFunctions::Murmur64Skipmer, - seed, + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!(sth_hashes, expected_hashes, "Hashes do not match in order"); + } + + #[test] + fn test_seqtohashes_translate() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 9; // needs to be *3 for protein + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + + let expected_kmers = vec![ + b"SRR".as_slice(), + b"RRA".as_slice(), + b"SST".as_slice(), + b"STT".as_slice(), + b"VVE".as_slice(), + b"ARR".as_slice(), + b"SSS".as_slice(), + b"LDD".as_slice(), + ]; + + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|fw_kmer| crate::_hash_murmur(fw_kmer, seed)) + .collect(); + + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" ); + } + + #[test] + fn test_seqtohashes_skipm1n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm1n3; + let k_size = 3; + let is_protein = false; + let seed = 42; + let force = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + // Expected k-mers for skipmer (m=1, n=3) across all frames + let expected_kmers = vec![ + (b"ACC".as_slice(), b"GGT".as_slice()), + (b"CCG".as_slice(), b"CGG".as_slice()), + (b"GGG".as_slice(), b"CCC".as_slice()), + (b"GGC".as_slice(), b"GCC".as_slice()), + (b"TTA".as_slice(), b"TAA".as_slice()), + (b"TAT".as_slice(), b"ATA".as_slice()), + ]; + + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) + .collect(); - // Define expected hashes for the skipmer configuration. - let expected_kmers = ["AGCGC", "GTGTA"]; - // rc of the k-mer, not of the sequence, then skipmerized. Correct? - let expected_krc = ["GCGCT", "TACAC"]; + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); - // Compute expected hashes by hashing each k-mer with its reverse complement + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); + } + + #[test] + fn test_seq2hashes_skipm2n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm2n3; + let k_size = 7; + let is_protein = false; + let seed = 42; + let force = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + + // Expected k-mers for skipmer (m=2, n=3) + let expected_kmers = vec![ + (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), + (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()), + (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), + (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()), + (b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice()), + ]; + + // Compute expected hashes let expected_hashes: Vec = expected_kmers .iter() - .zip(expected_krc.iter()) - .map(|(kmer, krc)| { - // Convert both kmer and krc to byte slices and pass to _hash_murmur - crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) - }) + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) .collect(); - // Compare each produced hash from the iterator with the expected hash - for expected_hash in expected_hashes { - let hash = seq_to_hashes.next().unwrap().ok().unwrap(); - assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); + } + + #[test] + fn test_seqtohashes_skipm2n3_2() { + let sequence = b"AGTCGTCA"; + let hash_function = HashFunctions::Murmur64Skipm2n3; + let k_size = 5; + let seed = 42; + let force = true; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + for fr in frames { + eprintln!("{}", fr); } + + let expected_kmers = vec![ + (b"AGCGC".as_slice(), b"GCGCT".as_slice()), + (b"GCGCA".as_slice(), b"TGCGC".as_slice()), + (b"GTGTA".as_slice(), b"TACAC".as_slice()), + ]; + + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) + .collect(); + + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); } } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 3c8f3b4240..d8ca2337a0 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -153,7 +153,8 @@ impl<'de> Deserialize<'de> for KmerMinHash { "dayhoff" => HashFunctions::Murmur64Dayhoff, "hp" => HashFunctions::Murmur64Hp, "dna" => HashFunctions::Murmur64Dna, - "skipmer" => HashFunctions::Murmur64Skipmer, + "skipm1n3" => HashFunctions::Murmur64Skipm1n3, + "skipm2n3" => HashFunctions::Murmur64Skipm2n3, _ => unimplemented!(), // TODO: throw error here }; diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 88332b1945..a9af0fdd24 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -345,6 +345,7 @@ def test_protein_override_bad_rust_foo(): siglist = factory() assert len(siglist) == 1 sig = siglist[0] + print(sig.minhash.ksize) # try adding something testdata1 = utils.get_test_data("ecoli.faa") @@ -354,7 +355,7 @@ def test_protein_override_bad_rust_foo(): with pytest.raises(ValueError) as exc: sig.add_protein(record.sequence) - assert 'Invalid hash function: "DNA"' in str(exc) + assert "invalid DNA character in input k-mer: MRVLKFGGTS" in str(exc) def test_dayhoff_defaults(): From 6359e165cc02fe72b99a02c0fe03a8afc055cf81 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 12 Dec 2024 14:58:59 -0800 Subject: [PATCH 09/22] try fix branchwater incompat --- src/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 41cf944fac..469bb2bbe7 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -96,7 +96,7 @@ skip_feature_sets = [ ## Wasm section. Crates only used for WASM, as well as specific configurations [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies] -js-sys = "0.3.74" +js-sys = "0.3.72" web-sys = { version = "0.3.74", features = ["console", "File", "FileReaderSync"] } wasm-bindgen = "0.2.89" getrandom = { version = "0.2", features = ["js"] } From 43e069c7cb7217bb8c1c812ad70c4db301015c46 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 12 Dec 2024 15:00:54 -0800 Subject: [PATCH 10/22] roll back web-sys too --- src/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 469bb2bbe7..006adae4d7 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -97,7 +97,7 @@ skip_feature_sets = [ [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies] js-sys = "0.3.72" -web-sys = { version = "0.3.74", features = ["console", "File", "FileReaderSync"] } +web-sys = { version = "0.3.72", features = ["console", "File", "FileReaderSync"] } wasm-bindgen = "0.2.89" getrandom = { version = "0.2", features = ["js"] } From 600864663c36d4a90d407aac6260ab1e1467be59 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 12 Dec 2024 15:04:57 -0800 Subject: [PATCH 11/22] add js-sys, web-sys to dependabot ignore --- .github/dependabot.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index d01a55a915..50a37271e5 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -17,6 +17,8 @@ updates: - dependency-name: "wasm-bindgen" - dependency-name: "once_cell" - dependency-name: "chrono" + - dependency-name: "js-sys" + - dependency-name: "web-sys" - package-ecosystem: "github-actions" directory: "/" schedule: From b20479c3f83f0015e227c4e6fad814014bdcbc31 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 16 Dec 2024 11:44:33 -0800 Subject: [PATCH 12/22] add more tests --- src/core/src/encodings.rs | 91 +++++++++++++++++++++++++++++++++++++++ src/core/src/signature.rs | 34 ++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 56077c80e1..06af5d03c1 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -520,6 +520,7 @@ impl<'a> Iterator for Indices<'a> { #[cfg(test)] mod test { use super::*; + use std::convert::TryFrom; #[test] fn colors_update() { @@ -587,4 +588,94 @@ mod test { assert_eq!(colors.len(), 2); } + + #[test] + fn test_dna_method() { + assert!(HashFunctions::Murmur64Dna.dna()); + assert!(!HashFunctions::Murmur64Protein.dna()); + assert!(!HashFunctions::Murmur64Dayhoff.dna()); + } + + #[test] + fn test_protein_method() { + assert!(HashFunctions::Murmur64Protein.protein()); + assert!(!HashFunctions::Murmur64Dna.protein()); + assert!(!HashFunctions::Murmur64Dayhoff.protein()); + } + + #[test] + fn test_dayhoff_method() { + assert!(HashFunctions::Murmur64Dayhoff.dayhoff()); + assert!(!HashFunctions::Murmur64Dna.dayhoff()); + assert!(!HashFunctions::Murmur64Protein.dayhoff()); + } + + #[test] + fn test_hp_method() { + assert!(HashFunctions::Murmur64Hp.hp()); + assert!(!HashFunctions::Murmur64Dna.hp()); + assert!(!HashFunctions::Murmur64Protein.hp()); + } + + #[test] + fn test_skipm1n3_method() { + assert!(HashFunctions::Murmur64Skipm1n3.skipm1n3()); + assert!(!HashFunctions::Murmur64Dna.skipm1n3()); + assert!(!HashFunctions::Murmur64Protein.skipm1n3()); + } + + #[test] + fn test_skipm2n3_method() { + assert!(HashFunctions::Murmur64Skipm2n3.skipm2n3()); + assert!(!HashFunctions::Murmur64Dna.skipm2n3()); + assert!(!HashFunctions::Murmur64Protein.skipm2n3()); + } + + #[test] + fn test_display_hashfunctions() { + assert_eq!(HashFunctions::Murmur64Dna.to_string(), "DNA"); + assert_eq!(HashFunctions::Murmur64Protein.to_string(), "protein"); + assert_eq!(HashFunctions::Murmur64Dayhoff.to_string(), "dayhoff"); + assert_eq!(HashFunctions::Murmur64Hp.to_string(), "hp"); + assert_eq!(HashFunctions::Murmur64Skipm1n3.to_string(), "skipm1n3"); + assert_eq!(HashFunctions::Murmur64Skipm2n3.to_string(), "skipm2n3"); + assert_eq!( + HashFunctions::Custom("custom_string".into()).to_string(), + "custom_string" + ); + } + + #[test] + fn test_try_from_str_valid() { + assert_eq!( + HashFunctions::try_from("dna").unwrap(), + HashFunctions::Murmur64Dna + ); + assert_eq!( + HashFunctions::try_from("protein").unwrap(), + HashFunctions::Murmur64Protein + ); + assert_eq!( + HashFunctions::try_from("dayhoff").unwrap(), + HashFunctions::Murmur64Dayhoff + ); + assert_eq!( + HashFunctions::try_from("hp").unwrap(), + HashFunctions::Murmur64Hp + ); + assert_eq!( + HashFunctions::try_from("skipm1n3").unwrap(), + HashFunctions::Murmur64Skipm1n3 + ); + assert_eq!( + HashFunctions::try_from("skipm2n3").unwrap(), + HashFunctions::Murmur64Skipm2n3 + ); + } + + #[test] + #[should_panic(expected = "not implemented: unknown")] + fn test_try_from_str_invalid() { + HashFunctions::try_from("unknown").unwrap(); + } } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 5cc60ca299..7fdb9b7f2d 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -995,6 +995,7 @@ impl TryInto for Signature { #[cfg(test)] mod test { + use core::num; use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; @@ -1003,8 +1004,7 @@ mod test { use crate::cmd::ComputeParameters; use crate::encodings::HashFunctions; - use crate::signature::SeqToHashes; - use crate::signature::SigsTrait; + use crate::signature::{ReadingFrame, SeqToHashes, SigsTrait}; use super::Signature; @@ -1432,6 +1432,15 @@ mod test { } } + #[test] + fn test_readingframe_dna() { + let sequence = b"AGTCGT"; + let frame = ReadingFrame::new_dna(sequence); + + assert_eq!(frame.fw(), sequence.as_slice()); + assert_eq!(frame.rc(), b"ACGACT".as_slice()); + } + #[test] fn test_seqtohashes_frames_dna() { let sequence = b"AGTCGT"; @@ -1465,6 +1474,16 @@ mod test { assert_eq!(frames[0].fw(), sequence.as_slice()); } + #[test] + fn test_readingframe_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + let frame = + ReadingFrame::new_protein(sequence, hash_function.dayhoff(), hash_function.hp()); + + assert_eq!(frame.fw(), sequence.as_slice()); + } + #[test] #[should_panic] fn test_seqtohashes_frames_is_protein_try_access_rc() { @@ -1537,6 +1556,17 @@ mod test { assert_eq!(frames[5].fw(), b"LDD".as_slice()); } + #[test] + #[should_panic(expected = "Skipmer frame number must be < n")] + fn test_readingframe_skipmer() { + let sequence = b"AGTCGT"; + let m = 2; + let n = 3; + let num_frames = 4; // four frames but n is only 3 + + ReadingFrame::new_skipmer(sequence, num_frames, m, n); + } + #[test] fn test_seqtohashes_frames_skipmer_m1n3() { let sequence = b"AGTCGTCGAGCT"; From e5ea8af3a6734ab1b67a74ae951cd3cc7976b1ff Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 16 Dec 2024 11:55:51 -0800 Subject: [PATCH 13/22] more tests --- src/core/src/signature.rs | 87 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 7fdb9b7f2d..f7f3836fc2 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -995,7 +995,6 @@ impl TryInto for Signature { #[cfg(test)] mod test { - use core::num; use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; @@ -1441,6 +1440,83 @@ mod test { assert_eq!(frame.rc(), b"ACGACT".as_slice()); } + #[test] + fn test_fw_dna() { + let dna_frame = ReadingFrame::DNA { + fw: b"ATCG".to_vec(), + rc: b"CGAT".to_vec(), + len: 4, + }; + assert_eq!(dna_frame.fw(), b"ATCG"); + } + + #[test] + fn test_rc_dna() { + let dna_frame = ReadingFrame::DNA { + fw: b"ATCG".to_vec(), + rc: b"CGAT".to_vec(), + len: 4, + }; + assert_eq!(dna_frame.rc(), b"CGAT"); + } + + #[test] + fn test_length_dna() { + let dna_frame = ReadingFrame::DNA { + fw: b"ATCG".to_vec(), + rc: b"CGAT".to_vec(), + len: 4, + }; + assert_eq!(dna_frame.length(), 4); + } + + #[test] + fn test_frame_type_dna() { + let dna_frame = ReadingFrame::DNA { + fw: b"ATCG".to_vec(), + rc: b"CGAT".to_vec(), + len: 4, + }; + assert_eq!(dna_frame.frame_type(), "DNA"); + } + + #[test] + fn test_fw_protein() { + let protein_frame = ReadingFrame::Protein { + fw: b"MVHL".to_vec(), + len: 4, + }; + assert_eq!(protein_frame.fw(), b"MVHL"); + } + + #[test] + #[should_panic(expected = "Reverse complement is only available for DNA frames")] + fn test_rc_protein_panics() { + let protein_frame = ReadingFrame::Protein { + fw: b"MVHL".to_vec(), + len: 4, + }; + protein_frame.rc(); + } + + #[test] + fn test_length_protein() { + let protein_frame = ReadingFrame::Protein { + fw: b"MVHL".to_vec(), + len: 4, + }; + assert_eq!(protein_frame.length(), 4); + } + + #[test] + fn test_frame_type_protein() { + let protein_frame = ReadingFrame::Protein { + fw: b"MVHL".to_vec(), + len: 4, + }; + assert_eq!(protein_frame.frame_type(), "Protein"); + } + #[test] fn test_seqtohashes_frames_dna() { let sequence = b"AGTCGT"; @@ -1556,6 +1632,15 @@ mod test { assert_eq!(frames[5].fw(), b"LDD".as_slice()); } + #[test] + #[should_panic(expected = "Frame number must be 0, 1, or 2")] + fn test_readingframe_translate() { + let sequence = b"AGTCGT"; + let frame_start = 3; // four frames but translate can only + + ReadingFrame::new_translated(sequence, frame_start, false, false); + } + #[test] #[should_panic(expected = "Skipmer frame number must be < n")] fn test_readingframe_skipmer() { From 8a0600d79d41fe34e53a860dc62d89537087eac3 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 16 Dec 2024 12:43:42 -0800 Subject: [PATCH 14/22] test protein rf display --- src/core/src/signature.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index f7f3836fc2..743b5f3e57 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -1517,6 +1517,19 @@ mod test { assert_eq!(protein_frame.frame_type(), "Protein"); } + #[test] + fn test_readingframe_display_protein() { + // Create a Protein ReadingFrame + let protein_frame = ReadingFrame::Protein { + fw: b"MVHLK".to_vec(), + len: 5, + }; + + let output = format!("{}", protein_frame); + // Assert the output matches the expected format + assert_eq!(output, "Type: Protein (5aa), Forward: MVHLK"); + } + #[test] fn test_seqtohashes_frames_dna() { let sequence = b"AGTCGT"; From e2d6c90e622522bd43051277f9ebda826b84fb8e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 19 Dec 2024 13:38:41 -0800 Subject: [PATCH 15/22] init py skipmers --- src/core/src/ffi/mod.rs | 10 ++++++++-- src/sourmash/index/__init__.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index 6f1dff78e4..2a72fd549e 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -36,18 +36,22 @@ pub enum HashFunctions { Murmur64Protein = 2, Murmur64Dayhoff = 3, Murmur64Hp = 4, + Murmur64Skipm1n3 = 5, + Murmur64Skipm2n3 = 6, } impl From for crate::encodings::HashFunctions { fn from(v: HashFunctions) -> crate::encodings::HashFunctions { use crate::encodings::HashFunctions::{ - Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3, }; match v { HashFunctions::Murmur64Dna => Murmur64Dna, HashFunctions::Murmur64Protein => Murmur64Protein, HashFunctions::Murmur64Dayhoff => Murmur64Dayhoff, HashFunctions::Murmur64Hp => Murmur64Hp, + HashFunctions::Murmur64Skipm1n3 => Murmur64Skipm1n3, + HashFunctions::Murmur64Skipm2n3 => Murmur64Skipm2n3, } } } @@ -55,13 +59,15 @@ impl From for crate::encodings::HashFunctions { impl From for HashFunctions { fn from(v: crate::encodings::HashFunctions) -> HashFunctions { use crate::encodings::HashFunctions::{ - Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3, }; match v { Murmur64Dna => HashFunctions::Murmur64Dna, Murmur64Protein => HashFunctions::Murmur64Protein, Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff, Murmur64Hp => HashFunctions::Murmur64Hp, + Murmur64Skipm1n3 => HashFunctions::Murmur64Skipm1n3, + Murmur64Skipm2n3 => HashFunctions::Murmur64Skipm2n3, _ => todo!("Not supported, probably custom"), } } diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index a8a638653d..b04952d873 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -1242,7 +1242,7 @@ def _check_select_parameters(**kw): moltype = kw.get("moltype") if moltype is not None: - if moltype not in ["DNA", "protein", "dayhoff", "hp"]: + if moltype not in ["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]: raise ValueError(f"unknown moltype: {moltype}") scaled = kw.get("scaled") From f374b81353e538a89e5ca3c7ff6027638c5d3d7d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 19 Dec 2024 15:20:13 -0800 Subject: [PATCH 16/22] init ffi py skipmers --- src/core/src/ffi/minhash.rs | 12 +++++++++++ src/core/src/sketch/minhash.rs | 8 +++++++ src/sourmash/cli/utils.py | 33 +++++++++++++++++++++++++++++ src/sourmash/minhash.py | 38 +++++++++++++++++++++++++++++++++- src/sourmash/sourmash_args.py | 12 +++++++++-- 5 files changed, 100 insertions(+), 3 deletions(-) diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 55879dd060..a6fda53d1c 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -320,6 +320,18 @@ pub unsafe extern "C" fn kmerminhash_hp(ptr: *const SourmashKmerMinHash) -> bool mh.hp() } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_skipm1n3(ptr: *const SourmashKmerMinHash) -> bool { + let mh = SourmashKmerMinHash::as_rust(ptr); + mh.skipm1n3() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_skipm2n3(ptr: *const SourmashKmerMinHash) -> bool { + let mh = SourmashKmerMinHash::as_rust(ptr); + mh.skipm2n3() +} + #[no_mangle] pub unsafe extern "C" fn kmerminhash_seed(ptr: *const SourmashKmerMinHash) -> u64 { let mh = SourmashKmerMinHash::as_rust(ptr); diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index d8ca2337a0..9cd9f78b9b 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -711,6 +711,14 @@ impl KmerMinHash { self.hash_function == HashFunctions::Murmur64Hp } + pub fn skipm1n3(&self) -> bool { + self.hash_function == HashFunctions::Murmur64Skipm1n3 + } + + pub fn skipm2n3(&self) -> bool { + self.hash_function == HashFunctions::Murmur64Skipm2n3 + } + pub fn mins(&self) -> Vec { self.mins.clone() } diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 26da5ead5f..82a66dfe9b 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -50,6 +50,39 @@ def add_moltype_args(parser): ) parser.set_defaults(hp=False) + parser.add_argument( + "--skipm1n3", + "--skipmer-m1n3", + dest="skipm1n3", + action="store_true", + help="choose skipmer (m1n3) signatures", + ) + + parser.add_argument( + "--no-skipm1n3", + "--no-skipmer-m1n3", + dest="skipm1n3", + action="store_false", + help="do not choose skipmer (m1n3) signatures", + ) + parser.set_defaults(skipm1n3=False) + + parser.add_argument( + "--skipm2n3", + "--skipmer-m2n3", + dest="skipm2n3", + action="store_true", + help="choose skipmer (m2n3) signatures", + ) + parser.add_argument( + "--no-skipm2n3", + "--no-skipmer-m2n3", + dest="skipm2n3", + action="store_false", + help="do not choose skipmer (m2n3) signatures", + ) + parser.set_defaults(skipm2n3=False) + parser.add_argument( "--dna", "--rna", diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 90d8244e2d..7188ed7427 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -196,6 +196,8 @@ def __init__( is_protein=False, dayhoff=False, hp=False, + skipm1n3=False, + skipm2n3=False, track_abundance=False, seed=MINHASH_DEFAULT_SEED, max_hash=0, @@ -215,6 +217,8 @@ def __init__( * is_protein (default False) - aa k-mers * dayhoff (default False) - dayhoff encoding * hp (default False) - hydrophilic/hydrophobic aa + * skipm1n3 (default False) - skipmer (m1n3) + * skipm2n3 (default False) - skipmer (m2n3) * track_abundance (default False) - track hash multiplicity * mins (default None) - list of hashvals, or (hashval, abund) pairs * seed (default 42) - murmurhash seed @@ -243,6 +247,10 @@ def __init__( elif is_protein: hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN ksize = ksize * 3 + elif skipm1n3: + hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3 + elif skipm2n3: + hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3 else: hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA @@ -264,6 +272,8 @@ def __copy__(self): is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, + skipm1n3=self.skipm1n3, + skipm2n3=self.skipm2n3, track_abundance=self.track_abundance, seed=self.seed, max_hash=self._max_hash, @@ -285,6 +295,8 @@ def __getstate__(self): self.is_protein, self.dayhoff, self.hp, + self.skipm1n3, + self.skipm2n3, self.hashes, None, self.track_abundance, @@ -300,6 +312,8 @@ def __setstate__(self, tup): is_protein, dayhoff, hp, + skipm1n3, + skipm2n3, mins, _, track_abundance, @@ -316,6 +330,10 @@ def __setstate__(self, tup): if hp else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3 + if skipm1n3 + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3 + if skipm2n3 else lib.HASH_FUNCTIONS_MURMUR64_DNA ) @@ -340,6 +358,8 @@ def copy_and_clear(self): is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, + skipm1n3=self.skipm1n3, + skipm2n3=self.skipm2n3, track_abundance=self.track_abundance, seed=self.seed, max_hash=self._max_hash, @@ -561,7 +581,9 @@ def scaled(self): @property def is_dna(self): - return not (self.is_protein or self.dayhoff or self.hp) + return not ( + self.is_protein or self.dayhoff or self.hp or self.skipm1n3 or self.skipm2n3 + ) @property def is_protein(self): @@ -575,6 +597,14 @@ def dayhoff(self): def hp(self): return self._methodcall(lib.kmerminhash_hp) + @property + def skipm1n3(self): + return self._methodcall(lib.kmerminhash_skipm1n3) + + @property + def skipm2n3(self): + return self._methodcall(lib.kmerminhash_skipm2n3) + @property def ksize(self): k = self._methodcall(lib.kmerminhash_ksize) @@ -1224,6 +1254,8 @@ def __setstate__(self, tup): is_protein, dayhoff, hp, + skipm1n3, + skipm2n3, mins, _, track_abundance, @@ -1240,6 +1272,10 @@ def __setstate__(self, tup): if hp else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3 + if skipm1n3 + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3 + if skipm2n3 else lib.HASH_FUNCTIONS_MURMUR64_DNA ) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 9f2068160a..20028fa921 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -84,7 +84,7 @@ def check_num_bounds(arg): def get_moltype(sig, require=False): mh = sig.minhash - if mh.moltype in ("DNA", "dayhoff", "hp", "protein"): + if mh.moltype in ("DNA", "dayhoff", "hp", "protein", "skipm1n3", "skipm2n3"): moltype = mh.moltype else: raise ValueError(f"unknown molecule type for sig {sig}") @@ -109,9 +109,15 @@ def calculate_moltype(args, default=None): moltype = "protein" n += 1 + if args.skipm1n3: + moltype = "skipm1n3" + + if args.skipm2n3: + moltype = "skipm2n3" + if n > 1: error( - "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff" + "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff/--skipm1n3/--skipm2n3" ) sys.exit(-1) @@ -185,11 +191,13 @@ def load_include_exclude_db_patterns(args): def search_pattern(vals): return any(pattern.search(val) for val in vals) + elif args.exclude_db_pattern: pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE) def search_pattern(vals): return all(not pattern.search(val) for val in vals) + else: search_pattern = None From d66d4aff840a32f988795a7c1a9227fdf054b115 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 01:43:16 +0000 Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/sourmash/cli/utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 82a66dfe9b..467ed307fb 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -51,12 +51,12 @@ def add_moltype_args(parser): parser.set_defaults(hp=False) parser.add_argument( - "--skipm1n3", - "--skipmer-m1n3", - dest="skipm1n3", - action="store_true", - help="choose skipmer (m1n3) signatures", - ) + "--skipm1n3", + "--skipmer-m1n3", + dest="skipm1n3", + action="store_true", + help="choose skipmer (m1n3) signatures", + ) parser.add_argument( "--no-skipm1n3", @@ -68,12 +68,12 @@ def add_moltype_args(parser): parser.set_defaults(skipm1n3=False) parser.add_argument( - "--skipm2n3", - "--skipmer-m2n3", - dest="skipm2n3", - action="store_true", - help="choose skipmer (m2n3) signatures", - ) + "--skipm2n3", + "--skipmer-m2n3", + dest="skipm2n3", + action="store_true", + help="choose skipmer (m2n3) signatures", + ) parser.add_argument( "--no-skipm2n3", "--no-skipmer-m2n3", From 246a88fc43af6a1709aa8931c41c27599b12023c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 20 Dec 2024 09:53:08 -0800 Subject: [PATCH 18/22] upd bindgen --- include/sourmash.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/sourmash.h b/include/sourmash.h index f650035dec..33110912a5 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -13,6 +13,8 @@ enum HashFunctions { HASH_FUNCTIONS_MURMUR64_PROTEIN = 2, HASH_FUNCTIONS_MURMUR64_DAYHOFF = 3, HASH_FUNCTIONS_MURMUR64_HP = 4, + HASH_FUNCTIONS_MURMUR64_SKIPM1N3 = 5, + HASH_FUNCTIONS_MURMUR64_SKIPM2N3 = 6, }; typedef uint32_t HashFunctions; @@ -271,6 +273,10 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr, bool ignore_abundance, bool downsample); +bool kmerminhash_skipm1n3(const SourmashKmerMinHash *ptr); + +bool kmerminhash_skipm2n3(const SourmashKmerMinHash *ptr); + void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); From 0c7ed1bcb612e082eb7a5c6ef744403f61b25657 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 20 Dec 2024 11:18:57 -0800 Subject: [PATCH 19/22] more moltype --- src/sourmash/minhash.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 7188ed7427..d7ed0a03d4 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -291,7 +291,7 @@ def __getstate__(self): # get a ksize that makes sense to the Rust layer. See #2262. return ( self.num, - self.ksize if self.is_dna else self.ksize * 3, + self.ksize if self.is_dna or self.skipm1n3 or self.skipm2n3 else self.ksize * 3, self.is_protein, self.dayhoff, self.hp, @@ -481,7 +481,7 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): def add_kmer(self, kmer): "Add a kmer into the sketch." - if self.is_dna: + if self.is_dna or self.skipm1n3 or self.skipm2n3: if len(kmer) != self.ksize: raise ValueError(f"kmer to add is not {self.ksize} in length") else: @@ -608,7 +608,7 @@ def skipm2n3(self): @property def ksize(self): k = self._methodcall(lib.kmerminhash_ksize) - if not self.is_dna: + if not self.is_dna and not self.skipm1n3 and not self.skipm2n3: assert k % 3 == 0 k = int(k / 3) return k @@ -1081,6 +1081,10 @@ def moltype(self): # TODO: test in minhash tests return "dayhoff" elif self.hp: return "hp" + elif self.skipm1n3: + return "skipm1n3" + elif self.skipm2n3: + return "skipm2n3" else: return "DNA" From 379633ae6a5caca52c15a619ff3c18aa35f9b0cc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 19:19:10 +0000 Subject: [PATCH 20/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/sourmash/minhash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index d7ed0a03d4..ce3c0ed599 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -291,7 +291,9 @@ def __getstate__(self): # get a ksize that makes sense to the Rust layer. See #2262. return ( self.num, - self.ksize if self.is_dna or self.skipm1n3 or self.skipm2n3 else self.ksize * 3, + self.ksize + if self.is_dna or self.skipm1n3 or self.skipm2n3 + else self.ksize * 3, self.is_protein, self.dayhoff, self.hp, From b6601f656d9a86b01b02236a734a0cff0ab753f6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 20 Dec 2024 11:52:48 -0800 Subject: [PATCH 21/22] fix index for scaled --- src/sourmash/minhash.py | 4 ++++ src/sourmash/sig/__main__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index ce3c0ed599..c036fa6c3d 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -289,6 +289,10 @@ def __getstate__(self): # note: we multiple ksize by 3 here so that # pickle protocols that bypass __setstate__ # get a ksize that makes sense to the Rust layer. See #2262. + # CTB/NTP note: if you add things below, you might want to put + # them at the end, because we use internal indexes in a few places. + # see especially `_set_num_scaled()` in sig/__main__.my. + # My apologies. return ( self.num, self.ksize diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 0b97c84347..5809270284 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -102,7 +102,7 @@ def _set_num_scaled(mh, num, scaled): # Number of hashes is 0th parameter mh_params[0] = num # Scale is 8th parameter - mh_params[8] = _get_max_hash_for_scaled(scaled) + mh_params[10] = _get_max_hash_for_scaled(scaled) mh.__setstate__(mh_params) assert mh.num == num assert mh.scaled == scaled From 017280a27bc03b934330c749348f6b0e9ecf327e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 20 Dec 2024 12:03:37 -0800 Subject: [PATCH 22/22] rustfmt --- src/core/src/ffi/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index 2a72fd549e..dc8fb14c76 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -43,7 +43,8 @@ pub enum HashFunctions { impl From for crate::encodings::HashFunctions { fn from(v: HashFunctions) -> crate::encodings::HashFunctions { use crate::encodings::HashFunctions::{ - Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3, + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, + Murmur64Skipm2n3, }; match v { HashFunctions::Murmur64Dna => Murmur64Dna, @@ -59,7 +60,8 @@ impl From for crate::encodings::HashFunctions { impl From for HashFunctions { fn from(v: crate::encodings::HashFunctions) -> HashFunctions { use crate::encodings::HashFunctions::{ - Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3, + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, + Murmur64Skipm2n3, }; match v { Murmur64Dna => HashFunctions::Murmur64Dna,