From 910b76c126fb223386879071cbb51466086c14fc Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 12 Nov 2024 16:13:27 -0800
Subject: [PATCH 01/22] try skipmers

---
 src/core/src/ffi/minhash.rs |  4 ++--
 src/core/src/signature.rs   | 48 +++++++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs
index 7d7c050fb0..444609166d 100644
--- a/src/core/src/ffi/minhash.rs
+++ b/src/core/src/ffi/minhash.rs
@@ -74,14 +74,14 @@ Result<*const u64> {
     let mut output: Vec<u64> = Vec::with_capacity(insize);
 
     if force && bad_kmers_as_zeroes{
-        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){
+        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){
             match hash_value{
                 Ok(x) => output.push(x),
                 Err(err) => return Err(err),
             }
         }
     }else{
-        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){
+        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){
             match hash_value{
                 Ok(0) => continue,
                 Ok(x) => output.push(x),
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 0ab8190f98..191f1d89f9 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -43,6 +43,7 @@ pub trait SigsTrait {
             false,
             self.hash_function(),
             self.seed(),
+            false,
         );
 
         for hash_value in ready_hashes {
@@ -65,6 +66,7 @@ pub trait SigsTrait {
             true,
             self.hash_function(),
             self.seed(),
+            false,
         );
 
         for hash_value in ready_hashes {
@@ -174,6 +176,7 @@ pub struct SeqToHashes {
     hash_function: HashFunctions,
     seed: u64,
     hashes_buffer: Vec<u64>,
+    is_skipmer: bool,
 
     dna_configured: bool,
     dna_rc: Vec<u8>,
@@ -194,6 +197,7 @@ impl SeqToHashes {
         is_protein: bool,
         hash_function: HashFunctions,
         seed: u64,
+        is_skipmer: bool,
     ) -> SeqToHashes {
         let mut ksize: usize = k_size;
 
@@ -220,6 +224,7 @@ impl SeqToHashes {
             hash_function,
             seed,
             hashes_buffer: Vec::with_capacity(1000),
+            is_skipmer,
             dna_configured: false,
             dna_rc: Vec::with_capacity(1000),
             dna_ksize: 0,
@@ -266,8 +271,24 @@ impl Iterator for SeqToHashes {
 
                 // Processing DNA
                 if self.hash_function.dna() {
-                    let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize];
+                    // Generate skipmer: skip every third base
+                    let kmer: Vec<u8> = if self.is_skipmer {
+                        // Adjust length to capture enough bases even with skipping
+                        let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate
+                        self.sequence[self.kmer_index..self.kmer_index + extended_length]
+                            .iter()
+                            .enumerate()
+                            .filter(|&(i, _)| i % 3 != 2) // skip every third letter
+                            .take(self.dna_ksize) // limit to the desired final length
+                            .map(|(_, &base)| base)
+                            .collect()
+                    } else {
+                        // Regular k-mer
+                        self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize].to_vec()
+                    };
+                    // let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize];
 
+                    // validate k-mer
                     for j in std::cmp::max(self.kmer_index, self.dna_last_position_check)
                         ..self.kmer_index + self.dna_ksize
                     {
@@ -299,9 +320,28 @@ impl Iterator for SeqToHashes {
                     // (leaving this table here because I had to draw to
                     //  get the indices correctly)
 
-                    let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
-                        ..self.dna_len - self.kmer_index];
-                    let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed);
+                    // let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
+                    //     ..self.dna_len - self.kmer_index];
+
+                    let krc: Vec<u8> = if self.is_skipmer {
+                        // Generate skipmer for reverse complement
+                        let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate
+                        self.dna_rc[self.dna_len - extended_length - self.kmer_index
+                            ..self.dna_len - self.kmer_index]
+                            .iter()
+                            .enumerate()
+                            .filter(|&(i, _)| i % 3 != 2) // skip every third letter
+                            .take(self.dna_ksize) // limit to the desired final length
+                            .map(|(_, &base)| base)
+                            .collect()
+                    } else {
+                        // reg revcomp
+                        self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
+                            ..self.dna_len - self.kmer_index]
+                            .to_vec()
+                    };
+
+                    let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed);
                     self.kmer_index += 1;
                     Some(Ok(hash))
                 } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 {

From 2bac5fd6a191297c203f09adedc3033821f93a97 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 13 Nov 2024 16:57:01 -0800
Subject: [PATCH 02/22] use hash fn encoding instead; init testing

---
 src/core/src/encodings.rs   |   6 ++
 src/core/src/ffi/minhash.rs |   4 +-
 src/core/src/signature.rs   | 128 ++++++++++++++++++++++++------------
 3 files changed, 93 insertions(+), 45 deletions(-)

diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs
index 8375699cb0..f273e14770 100644
--- a/src/core/src/encodings.rs
+++ b/src/core/src/encodings.rs
@@ -31,6 +31,7 @@ pub enum HashFunctions {
     Murmur64Protein,
     Murmur64Dayhoff,
     Murmur64Hp,
+    Murmur64Skip,
     Custom(String),
 }
 
@@ -50,6 +51,9 @@ impl HashFunctions {
     pub fn hp(&self) -> bool {
         *self == HashFunctions::Murmur64Hp
     }
+    pub fn skipmer(&self) -> bool {
+        *self == HashFunctions::Murmur64Skip
+    }
 }
 
 impl std::fmt::Display for HashFunctions {
@@ -62,6 +66,7 @@ impl std::fmt::Display for HashFunctions {
                 HashFunctions::Murmur64Protein => "protein",
                 HashFunctions::Murmur64Dayhoff => "dayhoff",
                 HashFunctions::Murmur64Hp => "hp",
+                HashFunctions::Murmur64Skip => "skipmer",
                 HashFunctions::Custom(v) => v,
             }
         )
@@ -77,6 +82,7 @@ impl TryFrom<&str> for HashFunctions {
             "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff),
             "hp" => Ok(HashFunctions::Murmur64Hp),
             "protein" => Ok(HashFunctions::Murmur64Protein),
+            "skipmer" => Ok(HashFunctions::Murmur64Skip),
             v => unimplemented!("{v}"),
         }
     }
diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs
index ac324f5c10..55879dd060 100644
--- a/src/core/src/ffi/minhash.rs
+++ b/src/core/src/ffi/minhash.rs
@@ -74,14 +74,14 @@ Result<*const u64> {
     let mut output: Vec<u64> = Vec::with_capacity(insize);
 
     if force && bad_kmers_as_zeroes{
-        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){
+        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){
             match hash_value{
                 Ok(x) => output.push(x),
                 Err(err) => return Err(err),
             }
         }
     }else{
-        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed(), false){
+        for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){
             match hash_value{
                 Ok(0) => continue,
                 Ok(x) => output.push(x),
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index b5fbc28bc7..ca47bc49ad 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -43,7 +43,6 @@ pub trait SigsTrait {
             false,
             self.hash_function(),
             self.seed(),
-            false,
         );
 
         for hash_value in ready_hashes {
@@ -66,7 +65,6 @@ pub trait SigsTrait {
             true,
             self.hash_function(),
             self.seed(),
-            false,
         );
 
         for hash_value in ready_hashes {
@@ -176,7 +174,6 @@ pub struct SeqToHashes {
     hash_function: HashFunctions,
     seed: u64,
     hashes_buffer: Vec<u64>,
-    is_skipmer: bool,
 
     dna_configured: bool,
     dna_rc: Vec<u8>,
@@ -197,12 +194,11 @@ impl SeqToHashes {
         is_protein: bool,
         hash_function: HashFunctions,
         seed: u64,
-        is_skipmer: bool,
     ) -> SeqToHashes {
         let mut ksize: usize = k_size;
 
         // Divide the kmer size by 3 if protein
-        if is_protein || !hash_function.dna() {
+        if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() {
             ksize = k_size / 3;
         }
 
@@ -224,7 +220,6 @@ impl SeqToHashes {
             hash_function,
             seed,
             hashes_buffer: Vec::with_capacity(1000),
-            is_skipmer,
             dna_configured: false,
             dna_rc: Vec::with_capacity(1000),
             dna_ksize: 0,
@@ -260,7 +255,11 @@ impl Iterator for SeqToHashes {
                     self.dna_ksize = self.k_size;
                     self.dna_len = self.sequence.len();
                     if self.dna_len < self.dna_ksize
-                        || (!self.hash_function.dna() && self.dna_len < self.k_size * 3)
+                        || (self.hash_function.protein() && self.dna_len < self.k_size * 3)
+                        || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3)
+                        || (self.hash_function.hp() && self.dna_len < self.k_size * 3)
+                        || (self.hash_function.skipmer()
+                            && self.dna_len < (self.k_size + self.k_size / 2))
                     {
                         return None;
                     }
@@ -271,24 +270,9 @@ impl Iterator for SeqToHashes {
 
                 // Processing DNA
                 if self.hash_function.dna() {
-                    // Generate skipmer: skip every third base
-                    let kmer: Vec<u8> = if self.is_skipmer {
-                        // Adjust length to capture enough bases even with skipping
-                        let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate
-                        self.sequence[self.kmer_index..self.kmer_index + extended_length]
-                            .iter()
-                            .enumerate()
-                            .filter(|&(i, _)| i % 3 != 2) // skip every third letter
-                            .take(self.dna_ksize) // limit to the desired final length
-                            .map(|(_, &base)| base)
-                            .collect()
-                    } else {
-                        // Regular k-mer
-                        self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize].to_vec()
-                    };
-                    // let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize];
+                    let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize];
 
-                    // validate k-mer
+                    // validate the bases
                     for j in std::cmp::max(self.kmer_index, self.dna_last_position_check)
                         ..self.kmer_index + self.dna_ksize
                     {
@@ -320,26 +304,44 @@ impl Iterator for SeqToHashes {
                     // (leaving this table here because I had to draw to
                     //  get the indices correctly)
 
-                    // let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
-                    //     ..self.dna_len - self.kmer_index];
+                    let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
+                        ..self.dna_len - self.kmer_index];
+                    let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed);
+                    self.kmer_index += 1;
+                    Some(Ok(hash))
+                } else if self.hash_function.skipmer() {
+                    let extended_length = self.dna_ksize + self.dna_ksize / 2;
+                    // Build skipmer
+                    let kmer: Vec<u8> = self.sequence
+                        [self.kmer_index..self.kmer_index + extended_length]
+                        .iter()
+                        .enumerate()
+                        .filter(|&(i, _)| i % 3 != 2)
+                        .take(self.dna_ksize)
+                        .map(|(_, &base)| base)
+                        .collect();
+
+                    // check the bases are DNA
+                    if kmer.iter().any(|&base| !VALID[base as usize]) {
+                        if !self.force {
+                            return Some(Err(Error::InvalidDNA {
+                                message: String::from_utf8(kmer).unwrap(),
+                            }));
+                        } else {
+                            self.kmer_index += 1; // Move to the next position if forced
+                            return Some(Ok(0));
+                        }
+                    }
 
-                    let krc: Vec<u8> = if self.is_skipmer {
-                        // Generate skipmer for reverse complement
-                        let extended_length = self.dna_ksize + self.dna_ksize / 2; // extend by half to compensate
-                        self.dna_rc[self.dna_len - extended_length - self.kmer_index
-                            ..self.dna_len - self.kmer_index]
-                            .iter()
-                            .enumerate()
-                            .filter(|&(i, _)| i % 3 != 2) // skip every third letter
-                            .take(self.dna_ksize) // limit to the desired final length
-                            .map(|(_, &base)| base)
-                            .collect()
-                    } else {
-                        // reg revcomp
-                        self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
-                            ..self.dna_len - self.kmer_index]
-                            .to_vec()
-                    };
+                    // Generate reverse complement skipmer
+                    let krc: Vec<u8> = self.dna_rc[self.dna_len - extended_length - self.kmer_index
+                        ..self.dna_len - self.kmer_index]
+                        .iter()
+                        .enumerate()
+                        .filter(|&(i, _)| i % 3 != 2)
+                        .take(self.dna_ksize)
+                        .map(|(_, &base)| base)
+                        .collect();
 
                     let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed);
                     self.kmer_index += 1;
@@ -958,6 +960,8 @@ mod test {
     use needletail::parse_fastx_reader;
 
     use crate::cmd::ComputeParameters;
+    use crate::encodings::HashFunctions;
+    use crate::signature::SeqToHashes;
     use crate::signature::SigsTrait;
 
     use super::Signature;
@@ -1316,4 +1320,42 @@ mod test {
             assert_eq!(modified_sig.size(), 0);
         }
     }
+
+    #[test]
+    fn test_seqtohashes_skipmer() {
+        let sequence = b"AGTCGTCAGTCG";
+        let k_size = 7;
+        let seed = 42;
+        let force = true; // Force skip over invalid bases if needed
+
+        // Initialize SeqToHashes iterator using the new constructor
+        let mut seq_to_hashes = SeqToHashes::new(
+            sequence,
+            k_size,
+            force,
+            false,
+            HashFunctions::Murmur64Skip,
+            seed,
+        );
+
+        // Define expected hashes for the skipmer configuration.
+        let expected_kmers = ["AGCGTAG", "GTGTCGT", "TCTCATC", "CGCAGCG"];
+        let expected_krc = ["CTACGCT", "ACGACAC", "GATGAGA", "CGCTGCG"];
+
+        // Compute expected hashes by hashing each k-mer with its reverse complement
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .zip(expected_krc.iter())
+            .map(|(kmer, krc)| {
+                // Convert both kmer and krc to byte slices and pass to _hash_murmur
+                crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed)
+            })
+            .collect();
+
+        // Compare each produced hash from the iterator with the expected hash
+        for expected_hash in expected_hashes {
+            let hash = seq_to_hashes.next().unwrap().ok().unwrap();
+            assert_eq!(hash, expected_hash, "Mismatch in skipmer hash");
+        }
+    }
 }

From fdd1a44fda93db7f29f9862a2584ca60f870d7fa Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 14 Nov 2024 08:48:59 -0800
Subject: [PATCH 03/22] fix test

---
 src/core/src/signature.rs | 48 +++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index ca47bc49ad..d463639d16 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -1322,12 +1322,51 @@ mod test {
     }
 
     #[test]
-    fn test_seqtohashes_skipmer() {
-        let sequence = b"AGTCGTCAGTCG";
+    fn test_seqtohashes_dna() {
+        let sequence = b"AGTCGTCA";
         let k_size = 7;
         let seed = 42;
         let force = true; // Force skip over invalid bases if needed
 
+        // Initialize SeqToHashes iterator using the new constructor
+        let mut seq_to_hashes = SeqToHashes::new(
+            sequence,
+            k_size,
+            force,
+            false,
+            HashFunctions::Murmur64Dna,
+            seed,
+        );
+
+        // Define expected hashes for the kmer configuration.
+        let expected_kmers = ["AGTCGTC", "GTCGTCA"];
+        let expected_krc = ["GACGACT", "TGACGAC"];
+
+        // Compute expected hashes by hashing each k-mer with its reverse complement
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .zip(expected_krc.iter())
+            .map(|(kmer, krc)| {
+                // Convert both kmer and krc to byte slices and pass to _hash_murmur
+                crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed)
+            })
+            .collect();
+
+        // Compare each produced hash from the iterator with the expected hash
+        for expected_hash in expected_hashes {
+            let hash = seq_to_hashes.next().unwrap().ok().unwrap();
+            assert_eq!(hash, expected_hash, "Mismatch in skipmer hash");
+        }
+    }
+
+    #[test]
+    fn test_seqtohashes_skipmer() {
+        let sequence = b"AGTCGTCA";
+        // let rc_seq = b"TGACGACT";
+        let k_size = 5;
+        let seed = 42;
+        let force = true; // Force skip over invalid bases if needed
+
         // Initialize SeqToHashes iterator using the new constructor
         let mut seq_to_hashes = SeqToHashes::new(
             sequence,
@@ -1339,8 +1378,9 @@ mod test {
         );
 
         // Define expected hashes for the skipmer configuration.
-        let expected_kmers = ["AGCGTAG", "GTGTCGT", "TCTCATC", "CGCAGCG"];
-        let expected_krc = ["CTACGCT", "ACGACAC", "GATGAGA", "CGCTGCG"];
+        let expected_kmers = ["AGCGC", "GTGTA"];
+        // rc of the k-mer, not of the sequence, then skipmerized. Correct?
+        let expected_krc = ["GCGCT", "TACAC"];
 
         // Compute expected hashes by hashing each k-mer with its reverse complement
         let expected_hashes: Vec<u64> = expected_kmers

From ff33f7b57bc9d11af7aaa71d2bf8f449be983f16 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 19 Nov 2024 13:19:44 -0800
Subject: [PATCH 04/22] cont

---
 src/core/src/encodings.rs      |  8 +++----
 src/core/src/signature.rs      | 39 +++++++++++++++++++++-------------
 src/core/src/sketch/minhash.rs |  1 +
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs
index f273e14770..0aba272a5c 100644
--- a/src/core/src/encodings.rs
+++ b/src/core/src/encodings.rs
@@ -31,7 +31,7 @@ pub enum HashFunctions {
     Murmur64Protein,
     Murmur64Dayhoff,
     Murmur64Hp,
-    Murmur64Skip,
+    Murmur64Skipmer,
     Custom(String),
 }
 
@@ -52,7 +52,7 @@ impl HashFunctions {
         *self == HashFunctions::Murmur64Hp
     }
     pub fn skipmer(&self) -> bool {
-        *self == HashFunctions::Murmur64Skip
+        *self == HashFunctions::Murmur64Skipmer
     }
 }
 
@@ -66,7 +66,7 @@ impl std::fmt::Display for HashFunctions {
                 HashFunctions::Murmur64Protein => "protein",
                 HashFunctions::Murmur64Dayhoff => "dayhoff",
                 HashFunctions::Murmur64Hp => "hp",
-                HashFunctions::Murmur64Skip => "skipmer",
+                HashFunctions::Murmur64Skipmer => "skipmer",
                 HashFunctions::Custom(v) => v,
             }
         )
@@ -82,7 +82,7 @@ impl TryFrom<&str> for HashFunctions {
             "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff),
             "hp" => Ok(HashFunctions::Murmur64Hp),
             "protein" => Ok(HashFunctions::Murmur64Protein),
-            "skipmer" => Ok(HashFunctions::Murmur64Skip),
+            "skipmer" => Ok(HashFunctions::Murmur64Skipmer),
             v => unimplemented!("{v}"),
         }
     }
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index d463639d16..53d186fe54 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -230,6 +230,19 @@ impl SeqToHashes {
             translate_iter_step: 0,
         }
     }
+
+    fn validate_base(&self, base: u8, kmer: &Vec<u8>) -> Option<Result<u64, Error>> {
+        if !VALID[base as usize] {
+            if !self.force {
+                return Some(Err(Error::InvalidDNA {
+                    message: String::from_utf8(kmer.clone()).unwrap_or_default(),
+                }));
+            } else {
+                return Some(Ok(0)); // Skip this position if forced
+            }
+        }
+        None // Base is valid, so return None to continue
+    }
 }
 
 /*
@@ -311,26 +324,22 @@ impl Iterator for SeqToHashes {
                     Some(Ok(hash))
                 } else if self.hash_function.skipmer() {
                     let extended_length = self.dna_ksize + self.dna_ksize / 2;
-                    // Build skipmer
-                    let kmer: Vec<u8> = self.sequence
+
+                    // Build skipmer with DNA base validation
+                    let mut kmer: Vec<u8> = Vec::with_capacity(self.dna_ksize);
+                    for (_i, &base) in self.sequence
                         [self.kmer_index..self.kmer_index + extended_length]
                         .iter()
                         .enumerate()
                         .filter(|&(i, _)| i % 3 != 2)
                         .take(self.dna_ksize)
-                        .map(|(_, &base)| base)
-                        .collect();
-
-                    // check the bases are DNA
-                    if kmer.iter().any(|&base| !VALID[base as usize]) {
-                        if !self.force {
-                            return Some(Err(Error::InvalidDNA {
-                                message: String::from_utf8(kmer).unwrap(),
-                            }));
-                        } else {
-                            self.kmer_index += 1; // Move to the next position if forced
-                            return Some(Ok(0));
+                    {
+                        // Use the validate_base method to check the base
+                        if let Some(result) = self.validate_base(base, &kmer) {
+                            self.kmer_index += 1; // Move to the next position if skipping is forced
+                            return Some(result);
                         }
+                        kmer.push(base);
                     }
 
                     // Generate reverse complement skipmer
@@ -1373,7 +1382,7 @@ mod test {
             k_size,
             force,
             false,
-            HashFunctions::Murmur64Skip,
+            HashFunctions::Murmur64Skipmer,
             seed,
         );
 
diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs
index 438294e098..3c8f3b4240 100644
--- a/src/core/src/sketch/minhash.rs
+++ b/src/core/src/sketch/minhash.rs
@@ -153,6 +153,7 @@ impl<'de> Deserialize<'de> for KmerMinHash {
             "dayhoff" => HashFunctions::Murmur64Dayhoff,
             "hp" => HashFunctions::Murmur64Hp,
             "dna" => HashFunctions::Murmur64Dna,
+            "skipmer" => HashFunctions::Murmur64Skipmer,
             _ => unimplemented!(), // TODO: throw error here
         };
 

From 11978ed84cb9078e36128f3d55f5bb34eaad4d3e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 19 Nov 2024 15:01:21 -0800
Subject: [PATCH 05/22] test

---
 src/core/src/cmd.rs       | 21 +++++++++++++++++++++
 src/core/src/signature.rs | 30 +++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs
index b59e99b912..115d2672af 100644
--- a/src/core/src/cmd.rs
+++ b/src/core/src/cmd.rs
@@ -42,6 +42,10 @@ pub struct ComputeParameters {
     #[builder(default = false)]
     hp: bool,
 
+    #[getset(get_copy = "pub", set = "pub")]
+    #[builder(default = false)]
+    skipmer: bool,
+
     #[getset(get_copy = "pub", set = "pub")]
     #[builder(default = false)]
     singleton: bool,
@@ -165,6 +169,23 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
                 ));
             }
 
+            if params.skipmer {
+                ksigs.push(Sketch::LargeMinHash(
+                    KmerMinHashBTree::builder()
+                        .num(params.num_hashes)
+                        .ksize(*k)
+                        .hash_function(HashFunctions::Murmur64Skipmer)
+                        .max_hash(max_hash)
+                        .seed(params.seed)
+                        .abunds(if params.track_abundance {
+                            Some(Default::default())
+                        } else {
+                            None
+                        })
+                        .build(),
+                ));
+            }
+
             if params.dna {
                 ksigs.push(Sketch::LargeMinHash(
                     KmerMinHashBTree::builder()
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 53d186fe54..8013e506ea 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -272,7 +272,8 @@ impl Iterator for SeqToHashes {
                         || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3)
                         || (self.hash_function.hp() && self.dna_len < self.k_size * 3)
                         || (self.hash_function.skipmer()
-                            && self.dna_len < (self.k_size + self.k_size / 2))
+                            // add 1 to round up rather than down
+                            && self.dna_len < (self.k_size + ((self.k_size + 1) / 2) - 1))
                     {
                         return None;
                     }
@@ -323,7 +324,12 @@ impl Iterator for SeqToHashes {
                     self.kmer_index += 1;
                     Some(Ok(hash))
                 } else if self.hash_function.skipmer() {
-                    let extended_length = self.dna_ksize + self.dna_ksize / 2;
+                    let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down
+
+                    // Check bounds to ensure we don't exceed the sequence length
+                    if self.kmer_index + extended_length > self.sequence.len() {
+                        return None;
+                    }
 
                     // Build skipmer with DNA base validation
                     let mut kmer: Vec<u8> = Vec::with_capacity(self.dna_ksize);
@@ -1091,6 +1097,24 @@ mod test {
         assert_eq!(sig.signatures[1].size(), 2);
     }
 
+    #[test]
+    fn signature_skipmer_add_sequence() {
+        let params = ComputeParameters::builder()
+            .ksizes(vec![3, 6])
+            .num_hashes(3u32)
+            .dna(false)
+            .skipmer(true)
+            .build();
+
+        let mut sig = Signature::from_params(&params);
+        sig.add_sequence(b"ATGCATGA", false).unwrap();
+
+        assert_eq!(sig.signatures.len(), 2);
+        dbg!(&sig.signatures);
+        assert_eq!(sig.signatures[0].size(), 3);
+        assert_eq!(sig.signatures[1].size(), 1);
+    }
+
     #[test]
     fn signature_add_sequence_cp() {
         let mut cp = ComputeParameters::default();
@@ -1364,7 +1388,7 @@ mod test {
         // Compare each produced hash from the iterator with the expected hash
         for expected_hash in expected_hashes {
             let hash = seq_to_hashes.next().unwrap().ok().unwrap();
-            assert_eq!(hash, expected_hash, "Mismatch in skipmer hash");
+            assert_eq!(hash, expected_hash, "Mismatch in DNA hash");
         }
     }
 

From 2de520eeea6b4b75844030c512d892dcf48b4b3c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 19 Nov 2024 15:15:41 -0800
Subject: [PATCH 06/22] skipmers must be at least ksize 3

---
 src/core/src/signature.rs | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 8013e506ea..3f4d98cb51 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -324,6 +324,11 @@ impl Iterator for SeqToHashes {
                     self.kmer_index += 1;
                     Some(Ok(hash))
                 } else if self.hash_function.skipmer() {
+                    // check that we can actually build skipmers
+                    if self.k_size < 3 {
+                        unimplemented!()
+                        // return None
+                    }
                     let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down
 
                     // Check bounds to ensure we don't exceed the sequence length
@@ -1100,7 +1105,7 @@ mod test {
     #[test]
     fn signature_skipmer_add_sequence() {
         let params = ComputeParameters::builder()
-            .ksizes(vec![3, 6])
+            .ksizes(vec![3, 4, 5, 6])
             .num_hashes(3u32)
             .dna(false)
             .skipmer(true)
@@ -1109,10 +1114,26 @@ mod test {
         let mut sig = Signature::from_params(&params);
         sig.add_sequence(b"ATGCATGA", false).unwrap();
 
-        assert_eq!(sig.signatures.len(), 2);
+        assert_eq!(sig.signatures.len(), 4);
         dbg!(&sig.signatures);
         assert_eq!(sig.signatures[0].size(), 3);
-        assert_eq!(sig.signatures[1].size(), 1);
+        assert_eq!(sig.signatures[1].size(), 3);
+        assert_eq!(sig.signatures[2].size(), 2);
+        assert_eq!(sig.signatures[3].size(), 1);
+    }
+
+    #[test]
+    #[should_panic(expected = "not implemented")]
+    fn signature_skipmer_add_sequence_too_small() {
+        let params = ComputeParameters::builder()
+            .ksizes(vec![2])
+            .num_hashes(3u32)
+            .dna(false)
+            .skipmer(true)
+            .build();
+
+        let mut sig = Signature::from_params(&params);
+        sig.add_sequence(b"ATGCATGA", false).unwrap();
     }
 
     #[test]

From d4a620094ba90a86dcd18197732f2bb18f7266f5 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 19 Nov 2024 17:20:21 -0800
Subject: [PATCH 07/22] clippy fix

---
 src/core/src/signature.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 3f4d98cb51..93580170a2 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -231,11 +231,11 @@ impl SeqToHashes {
         }
     }
 
-    fn validate_base(&self, base: u8, kmer: &Vec<u8>) -> Option<Result<u64, Error>> {
+    fn validate_base(&self, base: u8, kmer: &[u8]) -> Option<Result<u64, Error>> {
         if !VALID[base as usize] {
             if !self.force {
                 return Some(Err(Error::InvalidDNA {
-                    message: String::from_utf8(kmer.clone()).unwrap_or_default(),
+                    message: String::from_utf8(kmer.to_owned()).unwrap_or_default(),
                 }));
             } else {
                 return Some(Ok(0)); // Skip this position if forced

From 96aea47337f98289f5eda6913c37d57b5cc2ebef Mon Sep 17 00:00:00 2001
From: Tessa Pierce Ward <bluegenes@users.noreply.github.com>
Date: Thu, 12 Dec 2024 13:57:47 -0800
Subject: [PATCH 08/22] WIP: skipmer improvements (#3415)

Make skipmers robust, but keep #3395 functional in the meantime.

This PR:
- enables second skipmer types, so we have m1n3 in addition to m2n3
- switches to a reading frame approach for both translation + skipmers,
which means we first build the reading frame, then kmerize, rather than
building kmers + translating/skipping on the fly
- avoids "extended length" needed for skipping on the fly

Since this changes the `SeqToHashes` strategy a bit, there's one python
test where we now see a different error.

Future thoughts:
- with the new structure, it would be straightforward to add validation
to exclude protein k-mers with invalid amino acids (`X`). I guess I'm
not entirely sure what happens to those atm...
---
 src/core/src/cmd.rs            |  27 +-
 src/core/src/encodings.rs      |  18 +-
 src/core/src/signature.rs      | 928 +++++++++++++++++++++++----------
 src/core/src/sketch/minhash.rs |   3 +-
 tests/test_sourmash_sketch.py  |   3 +-
 5 files changed, 707 insertions(+), 272 deletions(-)

diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs
index 115d2672af..d5bfb092ba 100644
--- a/src/core/src/cmd.rs
+++ b/src/core/src/cmd.rs
@@ -44,7 +44,11 @@ pub struct ComputeParameters {
 
     #[getset(get_copy = "pub", set = "pub")]
     #[builder(default = false)]
-    skipmer: bool,
+    skipm1n3: bool,
+
+    #[getset(get_copy = "pub", set = "pub")]
+    #[builder(default = false)]
+    skipm2n3: bool,
 
     #[getset(get_copy = "pub", set = "pub")]
     #[builder(default = false)]
@@ -169,12 +173,29 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
                 ));
             }
 
-            if params.skipmer {
+            if params.skipm1n3 {
+                ksigs.push(Sketch::LargeMinHash(
+                    KmerMinHashBTree::builder()
+                        .num(params.num_hashes)
+                        .ksize(*k)
+                        .hash_function(HashFunctions::Murmur64Skipm1n3)
+                        .max_hash(max_hash)
+                        .seed(params.seed)
+                        .abunds(if params.track_abundance {
+                            Some(Default::default())
+                        } else {
+                            None
+                        })
+                        .build(),
+                ));
+            }
+
+            if params.skipm2n3 {
                 ksigs.push(Sketch::LargeMinHash(
                     KmerMinHashBTree::builder()
                         .num(params.num_hashes)
                         .ksize(*k)
-                        .hash_function(HashFunctions::Murmur64Skipmer)
+                        .hash_function(HashFunctions::Murmur64Skipm2n3)
                         .max_hash(max_hash)
                         .seed(params.seed)
                         .abunds(if params.track_abundance {
diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs
index 0aba272a5c..56077c80e1 100644
--- a/src/core/src/encodings.rs
+++ b/src/core/src/encodings.rs
@@ -31,7 +31,8 @@ pub enum HashFunctions {
     Murmur64Protein,
     Murmur64Dayhoff,
     Murmur64Hp,
-    Murmur64Skipmer,
+    Murmur64Skipm1n3,
+    Murmur64Skipm2n3,
     Custom(String),
 }
 
@@ -51,8 +52,13 @@ impl HashFunctions {
     pub fn hp(&self) -> bool {
         *self == HashFunctions::Murmur64Hp
     }
-    pub fn skipmer(&self) -> bool {
-        *self == HashFunctions::Murmur64Skipmer
+
+    pub fn skipm1n3(&self) -> bool {
+        *self == HashFunctions::Murmur64Skipm1n3
+    }
+
+    pub fn skipm2n3(&self) -> bool {
+        *self == HashFunctions::Murmur64Skipm2n3
     }
 }
 
@@ -66,7 +72,8 @@ impl std::fmt::Display for HashFunctions {
                 HashFunctions::Murmur64Protein => "protein",
                 HashFunctions::Murmur64Dayhoff => "dayhoff",
                 HashFunctions::Murmur64Hp => "hp",
-                HashFunctions::Murmur64Skipmer => "skipmer",
+                HashFunctions::Murmur64Skipm1n3 => "skipm1n3",
+                HashFunctions::Murmur64Skipm2n3 => "skipm2n3",
                 HashFunctions::Custom(v) => v,
             }
         )
@@ -82,7 +89,8 @@ impl TryFrom<&str> for HashFunctions {
             "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff),
             "hp" => Ok(HashFunctions::Murmur64Hp),
             "protein" => Ok(HashFunctions::Murmur64Protein),
-            "skipmer" => Ok(HashFunctions::Murmur64Skipmer),
+            "skipm1n3" => Ok(HashFunctions::Murmur64Skipm1n3),
+            "skipm2n3" => Ok(HashFunctions::Murmur64Skipm2n3),
             v => unimplemented!("{v}"),
         }
     }
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 93580170a2..5cc60ca299 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -10,6 +10,7 @@ use std::path::Path;
 use std::str;
 
 use cfg_if::cfg_if;
+use itertools::Itertools;
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
@@ -163,27 +164,149 @@ impl SigsTrait for Sketch {
     }
 }
 
-// Iterator for converting sequence to hashes
+#[derive(Debug, Clone)]
+pub enum ReadingFrame {
+    DNA {
+        fw: Vec<u8>,
+        rc: Vec<u8>,
+        len: usize, // len gives max_index for kmer iterator
+    },
+    Protein {
+        fw: Vec<u8>,
+        len: usize,
+    },
+}
+
+impl std::fmt::Display for ReadingFrame {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ReadingFrame::DNA { fw, rc, len } => {
+                let fw_str = String::from_utf8_lossy(fw).to_string();
+                let rc_str = String::from_utf8_lossy(rc).to_string();
+                write!(
+                    f,
+                    "Type: DNA ({}bp), Forward: {}, Reverse Complement: {}",
+                    len, fw_str, rc_str
+                )
+            }
+            ReadingFrame::Protein { fw, len } => {
+                let fw_str = String::from_utf8_lossy(fw).to_string();
+                write!(f, "Type: Protein ({}aa), Forward: {}", len, fw_str)
+            }
+        }
+    }
+}
+
+impl ReadingFrame {
+    pub fn new_dna(sequence: &[u8]) -> Self {
+        let fw = sequence.to_ascii_uppercase();
+        let rc = revcomp(&fw);
+        let len = sequence.len();
+        ReadingFrame::DNA { fw, rc, len }
+    }
+
+    pub fn new_protein(sequence: &[u8], dayhoff: bool, hp: bool) -> Self {
+        let seq = sequence.to_ascii_uppercase();
+        let fw: Vec<u8> = if dayhoff {
+            seq.iter().map(|&aa| aa_to_dayhoff(aa)).collect()
+        } else if hp {
+            seq.iter().map(|&aa| aa_to_hp(aa)).collect()
+        } else {
+            seq
+        };
+
+        let len = fw.len();
+        ReadingFrame::Protein { fw, len }
+    }
+
+    pub fn new_skipmer(sequence: &[u8], start: usize, m: usize, n: usize) -> Self {
+        let seq = sequence.to_ascii_uppercase();
+        if start >= n {
+            panic!("Skipmer frame number must be < n ({})", n);
+        }
+        // do we need to round up? (+1)
+        let mut fw = Vec::with_capacity(((seq.len() * m) + 1) / n);
+        seq.iter().skip(start).enumerate().for_each(|(i, &base)| {
+            if i % n < m {
+                fw.push(base.to_ascii_uppercase());
+            }
+        });
+
+        let len = fw.len();
+        let rc = revcomp(&fw);
+        ReadingFrame::DNA { fw, rc, len }
+    }
+
+    // this is the only one that doesn't uppercase in here b/c more efficient to uppercase externally :/
+    pub fn new_translated(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Self {
+        if frame_number > 2 {
+            panic!("Frame number must be 0, 1, or 2");
+        }
+
+        // Translate sequence into amino acids
+        let mut fw = Vec::with_capacity(sequence.len() / 3);
+        // NOTE: b/c of chunks(3), we only process full codons and ignore leftover bases (e.g. 1 or 2 at end of frame)
+        sequence
+            .iter()
+            .skip(frame_number) // Skip the initial bases for the frame
+            .take(sequence.len() - frame_number) // Adjust length based on skipped bases
+            .chunks(3) // Group into codons (triplets) using itertools
+            .into_iter()
+            .filter_map(|chunk| {
+                let codon: Vec<u8> = chunk.cloned().collect(); // Collect the chunk into a Vec<u8>
+                to_aa(&codon, dayhoff, hp).ok() // Translate the codon
+            })
+            .for_each(|aa| fw.extend(aa)); // Extend `fw` with amino acids
+
+        let len = fw.len();
+
+        // return protein reading frame
+        ReadingFrame::Protein { fw, len }
+    }
+
+    /// Get the forward sequence.
+    #[inline]
+    pub fn fw(&self) -> &[u8] {
+        match self {
+            ReadingFrame::DNA { fw, .. } => fw,
+            ReadingFrame::Protein { fw, .. } => fw,
+        }
+    }
+
+    /// Get the reverse complement sequence (if DNA).
+    #[inline]
+    pub fn rc(&self) -> &[u8] {
+        match self {
+            ReadingFrame::DNA { rc, .. } => rc,
+            _ => panic!("Reverse complement is only available for DNA frames"),
+        }
+    }
+
+    #[inline]
+    pub fn length(&self) -> usize {
+        match self {
+            ReadingFrame::DNA { len, .. } => *len,
+            ReadingFrame::Protein { len, .. } => *len,
+        }
+    }
+
+    /// Get the type of the frame as a string.
+    pub fn frame_type(&self) -> &'static str {
+        match self {
+            ReadingFrame::DNA { .. } => "DNA",
+            ReadingFrame::Protein { .. } => "Protein",
+        }
+    }
+}
+
 pub struct SeqToHashes {
-    sequence: Vec<u8>,
-    kmer_index: usize,
     k_size: usize,
-    max_index: usize,
     force: bool,
-    is_protein: bool,
-    hash_function: HashFunctions,
     seed: u64,
-    hashes_buffer: Vec<u64>,
-
-    dna_configured: bool,
-    dna_rc: Vec<u8>,
-    dna_ksize: usize,
-    dna_len: usize,
-    dna_last_position_check: usize,
-
-    prot_configured: bool,
-    aa_seq: Vec<u8>,
-    translate_iter_step: usize,
+    frames: Vec<ReadingFrame>,
+    frame_index: usize,         // Index of the current frame
+    kmer_index: usize,          // Current k-mer index within the frame
+    last_position_check: usize, // Index of last base we validated
 }
 
 impl SeqToHashes {
@@ -194,118 +317,141 @@ impl SeqToHashes {
         is_protein: bool,
         hash_function: HashFunctions,
         seed: u64,
-    ) -> SeqToHashes {
+    ) -> Self {
         let mut ksize: usize = k_size;
 
-        // Divide the kmer size by 3 if protein
+        // Adjust kmer size for protein-based hash functions
         if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() {
             ksize = k_size / 3;
         }
 
-        // By setting _max_index to 0, the iterator will return None and exit
-        let _max_index = if seq.len() >= ksize {
-            seq.len() - ksize + 1
+        // Generate frames based on sequence type and hash function
+        let frames = if hash_function.dna() {
+            Self::dna_frames(seq)
+        } else if is_protein {
+            Self::protein_frames(seq, &hash_function)
+        } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() {
+            Self::translated_frames(seq, &hash_function)
+        } else if hash_function.skipm1n3() || hash_function.skipm2n3() {
+            Self::skipmer_frames(seq, &hash_function, ksize)
         } else {
-            0
+            unimplemented!();
         };
 
         SeqToHashes {
-            // Here we convert the sequence to upper case
-            sequence: seq.to_ascii_uppercase(),
             k_size: ksize,
-            kmer_index: 0,
-            max_index: _max_index,
             force,
-            is_protein,
-            hash_function,
             seed,
-            hashes_buffer: Vec::with_capacity(1000),
-            dna_configured: false,
-            dna_rc: Vec::with_capacity(1000),
-            dna_ksize: 0,
-            dna_len: 0,
-            dna_last_position_check: 0,
-            prot_configured: false,
-            aa_seq: Vec::new(),
-            translate_iter_step: 0,
+            frames,
+            frame_index: 0,
+            kmer_index: 0,
+            last_position_check: 0,
         }
     }
 
-    fn validate_base(&self, base: u8, kmer: &[u8]) -> Option<Result<u64, Error>> {
-        if !VALID[base as usize] {
-            if !self.force {
-                return Some(Err(Error::InvalidDNA {
-                    message: String::from_utf8(kmer.to_owned()).unwrap_or_default(),
-                }));
-            } else {
-                return Some(Ok(0)); // Skip this position if forced
-            }
+    /// generate frames from DNA: 1 DNA frame (fw+rc)
+    fn dna_frames(seq: &[u8]) -> Vec<ReadingFrame> {
+        vec![ReadingFrame::new_dna(seq)]
+    }
+
+    /// generate frames from protein: 1 protein frame
+    fn protein_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec<ReadingFrame> {
+        vec![ReadingFrame::new_protein(
+            seq,
+            hash_function.dayhoff(),
+            hash_function.hp(),
+        )]
+    }
+
+    /// generate translated frames: 6 protein frames
+    fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec<ReadingFrame> {
+        // since we need to revcomp BEFORE making ReadingFrames, uppercase the sequence here
+        let sequence = seq.to_ascii_uppercase();
+        let revcomp_sequence = revcomp(&sequence);
+        (0..3)
+            .flat_map(|frame_number| {
+                vec![
+                    ReadingFrame::new_translated(
+                        &sequence,
+                        frame_number,
+                        hash_function.dayhoff(),
+                        hash_function.hp(),
+                    ),
+                    ReadingFrame::new_translated(
+                        &revcomp_sequence,
+                        frame_number,
+                        hash_function.dayhoff(),
+                        hash_function.hp(),
+                    ),
+                ]
+            })
+            .collect()
+    }
+
+    /// generate skipmer frames: 3 DNA frames (each with fw+rc)
+    fn skipmer_frames(
+        seq: &[u8],
+        hash_function: &HashFunctions,
+        ksize: usize,
+    ) -> Vec<ReadingFrame> {
+        let (m, n) = if hash_function.skipm1n3() {
+            (1, 3)
+        } else {
+            (2, 3)
+        };
+        if ksize < n {
+            unimplemented!()
         }
-        None // Base is valid, so return None to continue
+        (0..3)
+            .flat_map(|frame_number| vec![ReadingFrame::new_skipmer(seq, frame_number, m, n)])
+            .collect()
     }
-}
 
-/*
-Iterator that return a kmer hash for all modes except translate.
-In translate mode:
-    - all the frames are processed at once and converted to hashes.
-    - all the hashes are stored in `hashes_buffer`
-    - after processing all the kmers, `translate_iter_step` is incremented
-      per iteration to iterate over all the indeces of the `hashes_buffer`.
-    - the iterator will die once `translate_iter_step` == length(hashes_buffer)
-More info https://github.com/sourmash-bio/sourmash/pull/1946
-*/
+    fn out_of_bounds(&self, frame: &ReadingFrame) -> bool {
+        self.kmer_index + self.k_size > frame.length()
+    }
+}
 
 impl Iterator for SeqToHashes {
     type Item = Result<u64, Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if (self.kmer_index < self.max_index) || !self.hashes_buffer.is_empty() {
-            // Processing DNA or Translated DNA
-            if !self.is_protein {
-                // Setting the parameters only in the first iteration
-                if !self.dna_configured {
-                    self.dna_ksize = self.k_size;
-                    self.dna_len = self.sequence.len();
-                    if self.dna_len < self.dna_ksize
-                        || (self.hash_function.protein() && self.dna_len < self.k_size * 3)
-                        || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3)
-                        || (self.hash_function.hp() && self.dna_len < self.k_size * 3)
-                        || (self.hash_function.skipmer()
-                            // add 1 to round up rather than down
-                            && self.dna_len < (self.k_size + ((self.k_size + 1) / 2) - 1))
-                    {
-                        return None;
-                    }
-                    // pre-calculate the reverse complement for the full sequence...
-                    self.dna_rc = revcomp(&self.sequence);
-                    self.dna_configured = true;
-                }
+        while self.frame_index < self.frames.len() {
+            let frame = &self.frames[self.frame_index];
+
+            // Do we need to move to the next frame?
+            if self.out_of_bounds(frame) {
+                self.frame_index += 1;
+                self.kmer_index = 0; // Reset for the next frame
+                self.last_position_check = 0;
+                continue;
+            }
 
-                // Processing DNA
-                if self.hash_function.dna() {
-                    let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize];
+            let result = match frame {
+                ReadingFrame::DNA { .. } => {
+                    let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size];
+                    let rc = frame.rc();
 
-                    // validate the bases
-                    for j in std::cmp::max(self.kmer_index, self.dna_last_position_check)
-                        ..self.kmer_index + self.dna_ksize
+                    // Validate k-mer bases
+                    for j in std::cmp::max(self.kmer_index, self.last_position_check)
+                        ..self.kmer_index + self.k_size
                     {
-                        if !VALID[self.sequence[j] as usize] {
+                        if !VALID[frame.fw()[j] as usize] {
                             if !self.force {
+                                // Return an error if force is false
                                 return Some(Err(Error::InvalidDNA {
                                     message: String::from_utf8(kmer.to_vec()).unwrap(),
                                 }));
                             } else {
+                                // Skip the invalid k-mer
                                 self.kmer_index += 1;
-                                // Move the iterator to the next step
                                 return Some(Ok(0));
                             }
                         }
-                        self.dna_last_position_check += 1;
+                        self.last_position_check += 1;
                     }
 
-                    // ... and then while moving the k-mer window forward for the sequence
-                    // we move another window backwards for the RC.
+                    // Compute canonical hash
                     //   For a ksize = 3, and a sequence AGTCGT (len = 6):
                     //                   +-+---------+---------------+-------+
                     //   seq      RC     |i|i + ksize|len - ksize - i|len - i|
@@ -317,146 +463,21 @@ impl Iterator for SeqToHashes {
                     //                   +-+---------+---------------+-------+
                     // (leaving this table here because I had to draw to
                     //  get the indices correctly)
-
-                    let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index
-                        ..self.dna_len - self.kmer_index];
+                    let krc = &rc[frame.length() - self.k_size - self.kmer_index
+                        ..frame.length() - self.kmer_index];
                     let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed);
-                    self.kmer_index += 1;
-                    Some(Ok(hash))
-                } else if self.hash_function.skipmer() {
-                    // check that we can actually build skipmers
-                    if self.k_size < 3 {
-                        unimplemented!()
-                        // return None
-                    }
-                    let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down
-
-                    // Check bounds to ensure we don't exceed the sequence length
-                    if self.kmer_index + extended_length > self.sequence.len() {
-                        return None;
-                    }
-
-                    // Build skipmer with DNA base validation
-                    let mut kmer: Vec<u8> = Vec::with_capacity(self.dna_ksize);
-                    for (_i, &base) in self.sequence
-                        [self.kmer_index..self.kmer_index + extended_length]
-                        .iter()
-                        .enumerate()
-                        .filter(|&(i, _)| i % 3 != 2)
-                        .take(self.dna_ksize)
-                    {
-                        // Use the validate_base method to check the base
-                        if let Some(result) = self.validate_base(base, &kmer) {
-                            self.kmer_index += 1; // Move to the next position if skipping is forced
-                            return Some(result);
-                        }
-                        kmer.push(base);
-                    }
-
-                    // Generate reverse complement skipmer
-                    let krc: Vec<u8> = self.dna_rc[self.dna_len - extended_length - self.kmer_index
-                        ..self.dna_len - self.kmer_index]
-                        .iter()
-                        .enumerate()
-                        .filter(|&(i, _)| i % 3 != 2)
-                        .take(self.dna_ksize)
-                        .map(|(_, &base)| base)
-                        .collect();
-
-                    let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed);
-                    self.kmer_index += 1;
-                    Some(Ok(hash))
-                } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 {
-                    // Processing protein by translating DNA
-                    // TODO: Implement iterator over frames instead of hashes_buffer.
-
-                    for frame_number in 0..3 {
-                        let substr: Vec<u8> = self
-                            .sequence
-                            .iter()
-                            .cloned()
-                            .skip(frame_number)
-                            .take(self.sequence.len() - frame_number)
-                            .collect();
-
-                        let aa = to_aa(
-                            &substr,
-                            self.hash_function.dayhoff(),
-                            self.hash_function.hp(),
-                        )
-                        .unwrap();
-
-                        aa.windows(self.k_size).for_each(|n| {
-                            let hash = crate::_hash_murmur(n, self.seed);
-                            self.hashes_buffer.push(hash);
-                        });
-
-                        let rc_substr: Vec<u8> = self
-                            .dna_rc
-                            .iter()
-                            .cloned()
-                            .skip(frame_number)
-                            .take(self.dna_rc.len() - frame_number)
-                            .collect();
-                        let aa_rc = to_aa(
-                            &rc_substr,
-                            self.hash_function.dayhoff(),
-                            self.hash_function.hp(),
-                        )
-                        .unwrap();
-
-                        aa_rc.windows(self.k_size).for_each(|n| {
-                            let hash = crate::_hash_murmur(n, self.seed);
-                            self.hashes_buffer.push(hash);
-                        });
-                    }
-                    Some(Ok(0))
-                } else {
-                    if self.translate_iter_step == self.hashes_buffer.len() {
-                        self.hashes_buffer.clear();
-                        self.kmer_index = self.max_index;
-                        return Some(Ok(0));
-                    }
-                    let curr_idx = self.translate_iter_step;
-                    self.translate_iter_step += 1;
-                    Some(Ok(self.hashes_buffer[curr_idx]))
+                    Ok(hash)
                 }
-            } else {
-                // Processing protein
-                // The kmer size is already divided by 3
-
-                if self.hash_function.protein() {
-                    let aa_kmer = &self.sequence[self.kmer_index..self.kmer_index + self.k_size];
-                    let hash = crate::_hash_murmur(aa_kmer, self.seed);
-                    self.kmer_index += 1;
-                    Some(Ok(hash))
-                } else {
-                    if !self.prot_configured {
-                        self.aa_seq = match &self.hash_function {
-                            HashFunctions::Murmur64Dayhoff => {
-                                self.sequence.iter().cloned().map(aa_to_dayhoff).collect()
-                            }
-                            HashFunctions::Murmur64Hp => {
-                                self.sequence.iter().cloned().map(aa_to_hp).collect()
-                            }
-                            invalid => {
-                                return Some(Err(Error::InvalidHashFunction {
-                                    function: format!("{}", invalid),
-                                }));
-                            }
-                        };
-                    }
-
-                    let aa_kmer = &self.aa_seq[self.kmer_index..self.kmer_index + self.k_size];
-                    let hash = crate::_hash_murmur(aa_kmer, self.seed);
-                    self.kmer_index += 1;
-                    Some(Ok(hash))
+                ReadingFrame::Protein { .. } => {
+                    let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size];
+                    Ok(crate::_hash_murmur(kmer, self.seed))
                 }
-            }
-        } else {
-            // End the iterator
-            None
+            };
+
+            self.kmer_index += 1; // Advance k-mer index for valid k-mers
+            return Some(result);
         }
+        None // No more frames or k-mers
     }
 }
 
@@ -973,6 +994,7 @@ impl TryInto<KmerMinHash> for Signature {
 
 #[cfg(test)]
 mod test {
+
     use std::fs::File;
     use std::io::{BufReader, Read};
     use std::path::PathBuf;
@@ -1103,12 +1125,12 @@ mod test {
     }
 
     #[test]
-    fn signature_skipmer_add_sequence() {
+    fn signature_skipm2n3_add_sequence() {
         let params = ComputeParameters::builder()
             .ksizes(vec![3, 4, 5, 6])
             .num_hashes(3u32)
             .dna(false)
-            .skipmer(true)
+            .skipm2n3(true)
             .build();
 
         let mut sig = Signature::from_params(&params);
@@ -1118,18 +1140,53 @@ mod test {
         dbg!(&sig.signatures);
         assert_eq!(sig.signatures[0].size(), 3);
         assert_eq!(sig.signatures[1].size(), 3);
-        assert_eq!(sig.signatures[2].size(), 2);
+        eprintln!("{:?}", sig.signatures[2]);
+        assert_eq!(sig.signatures[2].size(), 3);
         assert_eq!(sig.signatures[3].size(), 1);
     }
 
+    #[test]
+    fn signature_skipm1n3_add_sequence() {
+        let params = ComputeParameters::builder()
+            .ksizes(vec![3, 4, 5, 6])
+            .num_hashes(10u32)
+            .dna(false)
+            .skipm1n3(true)
+            .build();
+
+        let mut sig = Signature::from_params(&params);
+        sig.add_sequence(b"ATGCATGAATGAC", false).unwrap();
+
+        assert_eq!(sig.signatures.len(), 4);
+        dbg!(&sig.signatures);
+        assert_eq!(sig.signatures[0].size(), 5);
+        assert_eq!(sig.signatures[1].size(), 4);
+        assert_eq!(sig.signatures[2].size(), 1);
+        assert_eq!(sig.signatures[3].size(), 0);
+    }
+
     #[test]
     #[should_panic(expected = "not implemented")]
-    fn signature_skipmer_add_sequence_too_small() {
+    fn signature_skipm2n3_add_sequence_too_small() {
         let params = ComputeParameters::builder()
             .ksizes(vec![2])
-            .num_hashes(3u32)
+            .num_hashes(10u32)
             .dna(false)
-            .skipmer(true)
+            .skipm2n3(true)
+            .build();
+
+        let mut sig = Signature::from_params(&params);
+        sig.add_sequence(b"ATGCATGA", false).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "not implemented")]
+    fn signature_skipm1n3_add_sequence_too_small() {
+        let params = ComputeParameters::builder()
+            .ksizes(vec![2])
+            .num_hashes(10u32)
+            .dna(false)
+            .skipm1n3(true)
             .build();
 
         let mut sig = Signature::from_params(&params);
@@ -1375,19 +1432,213 @@ mod test {
         }
     }
 
+    #[test]
+    fn test_seqtohashes_frames_dna() {
+        let sequence = b"AGTCGT";
+        let hash_function = HashFunctions::Murmur64Dna;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        assert_eq!(frames.len(), 1);
+        assert_eq!(frames[0].fw(), sequence.as_slice());
+        assert_eq!(frames[0].rc(), b"ACGACT".as_slice());
+    }
+
+    #[test]
+    fn test_seqtohashes_frames_is_protein() {
+        let sequence = b"MVLSPADKTNVKAAW";
+        let hash_function = HashFunctions::Murmur64Protein;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = true;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        assert_eq!(frames.len(), 1);
+        assert_eq!(frames[0].fw(), sequence.as_slice());
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_seqtohashes_frames_is_protein_try_access_rc() {
+        // test panic if trying to access rc
+        let sequence = b"MVLSPADKTNVKAAW";
+        let hash_function = HashFunctions::Murmur64Protein;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = true;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        // protein frame doesn't have rc; this should panic
+        eprintln!("{:?}", frames[0].rc());
+    }
+
+    #[test]
+    fn test_seqtohashes_frames_is_protein_dayhoff() {
+        let sequence = b"MVLSPADKTNVKAAW";
+        let dayhoff_seq = b"eeebbbcdbcedbbf";
+        let hash_function = HashFunctions::Murmur64Dayhoff;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = true;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        assert_eq!(frames.len(), 1);
+        assert_eq!(frames[0].fw(), dayhoff_seq.as_slice());
+    }
+
+    #[test]
+    fn test_seqtohashes_frames_is_protein_hp() {
+        let sequence = b"MVLSPADKTNVKAAW";
+        let hp_seq = b"hhhphhpppphphhh";
+        let hash_function = HashFunctions::Murmur64Hp;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = true;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        assert_eq!(frames.len(), 1);
+        assert_eq!(frames[0].fw(), hp_seq.as_slice());
+    }
+
+    #[test]
+    fn test_seqtohashes_frames_translate_protein() {
+        let sequence = b"AGTCGTCGAGCT";
+        let hash_function = HashFunctions::Murmur64Protein;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        assert_eq!(frames[0].fw(), b"SRRA".as_slice());
+        assert_eq!(frames[1].fw(), b"SSTT".as_slice());
+        assert_eq!(frames[2].fw(), b"VVE".as_slice());
+        assert_eq!(frames[3].fw(), b"ARR".as_slice());
+        assert_eq!(frames[4].fw(), b"SSS".as_slice());
+        assert_eq!(frames[5].fw(), b"LDD".as_slice());
+    }
+
+    #[test]
+    fn test_seqtohashes_frames_skipmer_m1n3() {
+        let sequence = b"AGTCGTCGAGCT";
+        let hash_function = HashFunctions::Murmur64Skipm1n3; // Represents m=1, n=3
+        let k_size = 3; // K-mer size is not directly relevant for skipmer frame validation
+        let seed = 42; // Seed is also irrelevant for frame structure
+        let force = false;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+
+        eprintln!("Frames: {:?}", frames);
+
+        assert_eq!(frames.len(), 3); // Three skipmer frames
+
+        // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2)
+        assert_eq!(frames[0].fw(), b"ACCG".as_slice());
+        assert_eq!(frames[0].rc(), b"CGGT".as_slice());
+
+        assert_eq!(frames[1].fw(), b"GGGC".as_slice());
+        assert_eq!(frames[1].rc(), b"GCCC".as_slice());
+
+        assert_eq!(frames[2].fw(), b"TTAT".as_slice());
+        assert_eq!(frames[2].rc(), b"ATAA".as_slice());
+    }
+
+    #[test]
+    fn test_seqtohashes_frames_skipmer_m2n3() {
+        let sequence = b"AGTCGTCGAGCT";
+        let hash_function = HashFunctions::Murmur64Skipm2n3;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames;
+        eprintln!("Frames: {:?}", frames);
+
+        assert_eq!(frames.len(), 3); // Three skipmer frames
+
+        // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2)
+        assert_eq!(frames[0].fw(), b"AGCGCGGC".as_slice());
+        assert_eq!(frames[0].rc(), b"GCCGCGCT".as_slice());
+
+        assert_eq!(frames[1].fw(), b"GTGTGACT".as_slice());
+        assert_eq!(frames[1].rc(), b"AGTCACAC".as_slice());
+
+        assert_eq!(frames[2].fw(), b"TCTCAGT".as_slice());
+        assert_eq!(frames[2].rc(), b"ACTGAGA".as_slice());
+    }
+
     #[test]
     fn test_seqtohashes_dna() {
+        let sequence = b"AGTCGT";
+        let hash_function = HashFunctions::Murmur64Dna;
+        let k_size = 3;
+        let seed = 42;
+        let force = false;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+
+        // Expected k-mers from the forward and reverse complement sequence
+        let expected_kmers = vec![
+            (b"AGT".to_vec(), b"ACT".to_vec()),
+            (b"GTC".to_vec(), b"GAC".to_vec()),
+            (b"TCG".to_vec(), b"CGA".to_vec()),
+            (b"CGT".to_vec(), b"ACG".to_vec()),
+        ];
+
+        // Compute expected hashes from expected kmers
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed))
+            .collect();
+
+        // Collect hashes from SeqToHashes
+        let sth_hashes: Vec<u64> = sth.map(|result| result.unwrap()).collect();
+        eprintln!("SeqToHashes hashes: {:?}", sth_hashes);
+
+        // Check that SeqToHashes matches expected hashes in order
+        assert_eq!(
+            sth_hashes, expected_hashes,
+            "Hashes do not match in order for SeqToHashes"
+        );
+    }
+
+    #[test]
+    fn test_seqtohashes_dna_2() {
         let sequence = b"AGTCGTCA";
         let k_size = 7;
         let seed = 42;
         let force = true; // Force skip over invalid bases if needed
-
+        let is_protein = false;
         // Initialize SeqToHashes iterator using the new constructor
         let mut seq_to_hashes = SeqToHashes::new(
             sequence,
             k_size,
             force,
-            false,
+            is_protein,
             HashFunctions::Murmur64Dna,
             seed,
         );
@@ -1414,42 +1665,195 @@ mod test {
     }
 
     #[test]
-    fn test_seqtohashes_skipmer() {
-        let sequence = b"AGTCGTCA";
-        // let rc_seq = b"TGACGACT";
-        let k_size = 5;
+    fn test_seqtohashes_is_protein() {
+        let sequence = b"MVLSPADKTNVKAAW";
+        let hash_function = HashFunctions::Murmur64Protein;
+        let k_size = 3;
         let seed = 42;
-        let force = true; // Force skip over invalid bases if needed
+        let force = false;
+        let is_protein = true;
+
+        let sth = SeqToHashes::new(sequence, k_size * 3, force, is_protein, hash_function, seed);
+
+        // Expected k-mers for protein sequence
+        let expected_kmers = vec![
+            b"MVL".to_vec(),
+            b"VLS".to_vec(),
+            b"LSP".to_vec(),
+            b"SPA".to_vec(),
+            b"PAD".to_vec(),
+            b"ADK".to_vec(),
+            b"DKT".to_vec(),
+            b"KTN".to_vec(),
+            b"TNV".to_vec(),
+            b"NVK".to_vec(),
+            b"VKA".to_vec(),
+            b"KAA".to_vec(),
+            b"AAW".to_vec(),
+        ];
+
+        // Compute hashes for expected k-mers
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42))
+            .collect();
 
-        // Initialize SeqToHashes iterator using the new constructor
-        let mut seq_to_hashes = SeqToHashes::new(
-            sequence,
-            k_size,
-            force,
-            false,
-            HashFunctions::Murmur64Skipmer,
-            seed,
+        // Collect hashes from SeqToHashes
+        let sth_hashes: Vec<u64> = sth.map(|result| result.unwrap()).collect();
+        eprintln!("SeqToHashes hashes: {:?}", sth_hashes);
+
+        // Check that SeqToHashes matches expected hashes in order
+        assert_eq!(sth_hashes, expected_hashes, "Hashes do not match in order");
+    }
+
+    #[test]
+    fn test_seqtohashes_translate() {
+        let sequence = b"AGTCGTCGAGCT";
+        let hash_function = HashFunctions::Murmur64Protein;
+        let k_size = 9; // needs to be *3 for protein
+        let seed = 42;
+        let force = false;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+
+        let expected_kmers = vec![
+            b"SRR".as_slice(),
+            b"RRA".as_slice(),
+            b"SST".as_slice(),
+            b"STT".as_slice(),
+            b"VVE".as_slice(),
+            b"ARR".as_slice(),
+            b"SSS".as_slice(),
+            b"LDD".as_slice(),
+        ];
+
+        // Compute expected hashes
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .map(|fw_kmer| crate::_hash_murmur(fw_kmer, seed))
+            .collect();
+
+        // Collect hashes from SeqToHashes
+        let sth_hashes: Vec<u64> = sth.map(|result| result.unwrap()).collect();
+        eprintln!("SeqToHashes hashes: {:?}", sth_hashes);
+
+        // Check that SeqToHashes matches expected hashes in order
+        assert_eq!(
+            sth_hashes, expected_hashes,
+            "Hashes do not match in order for SeqToHashes"
         );
+    }
+
+    #[test]
+    fn test_seqtohashes_skipm1n3() {
+        let sequence = b"AGTCGTCGAGCT";
+        let hash_function = HashFunctions::Murmur64Skipm1n3;
+        let k_size = 3;
+        let is_protein = false;
+        let seed = 42;
+        let force = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        // Expected k-mers for skipmer (m=1, n=3) across all frames
+        let expected_kmers = vec![
+            (b"ACC".as_slice(), b"GGT".as_slice()),
+            (b"CCG".as_slice(), b"CGG".as_slice()),
+            (b"GGG".as_slice(), b"CCC".as_slice()),
+            (b"GGC".as_slice(), b"GCC".as_slice()),
+            (b"TTA".as_slice(), b"TAA".as_slice()),
+            (b"TAT".as_slice(), b"ATA".as_slice()),
+        ];
+
+        // Compute expected hashes
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed))
+            .collect();
 
-        // Define expected hashes for the skipmer configuration.
-        let expected_kmers = ["AGCGC", "GTGTA"];
-        // rc of the k-mer, not of the sequence, then skipmerized. Correct?
-        let expected_krc = ["GCGCT", "TACAC"];
+        // Collect hashes from SeqToHashes
+        let sth_hashes: Vec<u64> = sth.map(|result| result.unwrap()).collect();
+        eprintln!("SeqToHashes hashes: {:?}", sth_hashes);
 
-        // Compute expected hashes by hashing each k-mer with its reverse complement
+        // Check that SeqToHashes matches expected hashes in order
+        assert_eq!(
+            sth_hashes, expected_hashes,
+            "Hashes do not match in order for SeqToHashes"
+        );
+    }
+
+    #[test]
+    fn test_seq2hashes_skipm2n3() {
+        let sequence = b"AGTCGTCGAGCT";
+        let hash_function = HashFunctions::Murmur64Skipm2n3;
+        let k_size = 7;
+        let is_protein = false;
+        let seed = 42;
+        let force = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+
+        // Expected k-mers for skipmer (m=2, n=3)
+        let expected_kmers = vec![
+            (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()),
+            (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()),
+            (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()),
+            (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()),
+            (b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice()),
+        ];
+
+        // Compute expected hashes
         let expected_hashes: Vec<u64> = expected_kmers
             .iter()
-            .zip(expected_krc.iter())
-            .map(|(kmer, krc)| {
-                // Convert both kmer and krc to byte slices and pass to _hash_murmur
-                crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed)
-            })
+            .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed))
             .collect();
 
-        // Compare each produced hash from the iterator with the expected hash
-        for expected_hash in expected_hashes {
-            let hash = seq_to_hashes.next().unwrap().ok().unwrap();
-            assert_eq!(hash, expected_hash, "Mismatch in skipmer hash");
+        // Collect hashes from SeqToHashes
+        let sth_hashes: Vec<u64> = sth.map(|result| result.unwrap()).collect();
+        eprintln!("SeqToHashes hashes: {:?}", sth_hashes);
+
+        // Check that SeqToHashes matches expected hashes in order
+        assert_eq!(
+            sth_hashes, expected_hashes,
+            "Hashes do not match in order for SeqToHashes"
+        );
+    }
+
+    #[test]
+    fn test_seqtohashes_skipm2n3_2() {
+        let sequence = b"AGTCGTCA";
+        let hash_function = HashFunctions::Murmur64Skipm2n3;
+        let k_size = 5;
+        let seed = 42;
+        let force = true;
+        let is_protein = false;
+
+        let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed);
+        let frames = sth.frames.clone();
+        for fr in frames {
+            eprintln!("{}", fr);
         }
+
+        let expected_kmers = vec![
+            (b"AGCGC".as_slice(), b"GCGCT".as_slice()),
+            (b"GCGCA".as_slice(), b"TGCGC".as_slice()),
+            (b"GTGTA".as_slice(), b"TACAC".as_slice()),
+        ];
+
+        // Compute expected hashes
+        let expected_hashes: Vec<u64> = expected_kmers
+            .iter()
+            .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed))
+            .collect();
+
+        // Collect hashes from SeqToHashes
+        let sth_hashes: Vec<u64> = sth.map(|result| result.unwrap()).collect();
+        eprintln!("SeqToHashes hashes: {:?}", sth_hashes);
+
+        // Check that SeqToHashes matches expected hashes in order
+        assert_eq!(
+            sth_hashes, expected_hashes,
+            "Hashes do not match in order for SeqToHashes"
+        );
     }
 }
diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs
index 3c8f3b4240..d8ca2337a0 100644
--- a/src/core/src/sketch/minhash.rs
+++ b/src/core/src/sketch/minhash.rs
@@ -153,7 +153,8 @@ impl<'de> Deserialize<'de> for KmerMinHash {
             "dayhoff" => HashFunctions::Murmur64Dayhoff,
             "hp" => HashFunctions::Murmur64Hp,
             "dna" => HashFunctions::Murmur64Dna,
-            "skipmer" => HashFunctions::Murmur64Skipmer,
+            "skipm1n3" => HashFunctions::Murmur64Skipm1n3,
+            "skipm2n3" => HashFunctions::Murmur64Skipm2n3,
             _ => unimplemented!(), // TODO: throw error here
         };
 
diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py
index 88332b1945..a9af0fdd24 100644
--- a/tests/test_sourmash_sketch.py
+++ b/tests/test_sourmash_sketch.py
@@ -345,6 +345,7 @@ def test_protein_override_bad_rust_foo():
     siglist = factory()
     assert len(siglist) == 1
     sig = siglist[0]
+    print(sig.minhash.ksize)
 
     # try adding something
     testdata1 = utils.get_test_data("ecoli.faa")
@@ -354,7 +355,7 @@ def test_protein_override_bad_rust_foo():
     with pytest.raises(ValueError) as exc:
         sig.add_protein(record.sequence)
 
-    assert 'Invalid hash function: "DNA"' in str(exc)
+    assert "invalid DNA character in input k-mer: MRVLKFGGTS" in str(exc)
 
 
 def test_dayhoff_defaults():

From 6359e165cc02fe72b99a02c0fe03a8afc055cf81 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 12 Dec 2024 14:58:59 -0800
Subject: [PATCH 09/22] try fix branchwater incompat

---
 src/core/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml
index 41cf944fac..469bb2bbe7 100644
--- a/src/core/Cargo.toml
+++ b/src/core/Cargo.toml
@@ -96,7 +96,7 @@ skip_feature_sets = [
 ## Wasm section. Crates only used for WASM, as well as specific configurations
 
 [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies]
-js-sys = "0.3.74"
+js-sys = "0.3.72"
 web-sys = { version = "0.3.74", features = ["console", "File", "FileReaderSync"] }
 wasm-bindgen = "0.2.89"
 getrandom = { version = "0.2", features = ["js"] }

From 43e069c7cb7217bb8c1c812ad70c4db301015c46 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 12 Dec 2024 15:00:54 -0800
Subject: [PATCH 10/22] roll back web-sys too

---
 src/core/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml
index 469bb2bbe7..006adae4d7 100644
--- a/src/core/Cargo.toml
+++ b/src/core/Cargo.toml
@@ -97,7 +97,7 @@ skip_feature_sets = [
 
 [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies]
 js-sys = "0.3.72"
-web-sys = { version = "0.3.74", features = ["console", "File", "FileReaderSync"] }
+web-sys = { version = "0.3.72", features = ["console", "File", "FileReaderSync"] }
 wasm-bindgen = "0.2.89"
 getrandom = { version = "0.2", features = ["js"] }
 

From 600864663c36d4a90d407aac6260ab1e1467be59 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 12 Dec 2024 15:04:57 -0800
Subject: [PATCH 11/22] add js-sys, web-sys to dependabot ignore

---
 .github/dependabot.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index d01a55a915..50a37271e5 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -17,6 +17,8 @@ updates:
     - dependency-name: "wasm-bindgen"
     - dependency-name: "once_cell"
     - dependency-name: "chrono"
+    - dependency-name: "js-sys"
+    - dependency-name: "web-sys"
 - package-ecosystem: "github-actions"
   directory: "/"
   schedule:

From b20479c3f83f0015e227c4e6fad814014bdcbc31 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 16 Dec 2024 11:44:33 -0800
Subject: [PATCH 12/22] add more tests

---
 src/core/src/encodings.rs | 91 +++++++++++++++++++++++++++++++++++++++
 src/core/src/signature.rs | 34 ++++++++++++++-
 2 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs
index 56077c80e1..06af5d03c1 100644
--- a/src/core/src/encodings.rs
+++ b/src/core/src/encodings.rs
@@ -520,6 +520,7 @@ impl<'a> Iterator for Indices<'a> {
 #[cfg(test)]
 mod test {
     use super::*;
+    use std::convert::TryFrom;
 
     #[test]
     fn colors_update() {
@@ -587,4 +588,94 @@ mod test {
 
         assert_eq!(colors.len(), 2);
     }
+
+    #[test]
+    fn test_dna_method() {
+        assert!(HashFunctions::Murmur64Dna.dna());
+        assert!(!HashFunctions::Murmur64Protein.dna());
+        assert!(!HashFunctions::Murmur64Dayhoff.dna());
+    }
+
+    #[test]
+    fn test_protein_method() {
+        assert!(HashFunctions::Murmur64Protein.protein());
+        assert!(!HashFunctions::Murmur64Dna.protein());
+        assert!(!HashFunctions::Murmur64Dayhoff.protein());
+    }
+
+    #[test]
+    fn test_dayhoff_method() {
+        assert!(HashFunctions::Murmur64Dayhoff.dayhoff());
+        assert!(!HashFunctions::Murmur64Dna.dayhoff());
+        assert!(!HashFunctions::Murmur64Protein.dayhoff());
+    }
+
+    #[test]
+    fn test_hp_method() {
+        assert!(HashFunctions::Murmur64Hp.hp());
+        assert!(!HashFunctions::Murmur64Dna.hp());
+        assert!(!HashFunctions::Murmur64Protein.hp());
+    }
+
+    #[test]
+    fn test_skipm1n3_method() {
+        assert!(HashFunctions::Murmur64Skipm1n3.skipm1n3());
+        assert!(!HashFunctions::Murmur64Dna.skipm1n3());
+        assert!(!HashFunctions::Murmur64Protein.skipm1n3());
+    }
+
+    #[test]
+    fn test_skipm2n3_method() {
+        assert!(HashFunctions::Murmur64Skipm2n3.skipm2n3());
+        assert!(!HashFunctions::Murmur64Dna.skipm2n3());
+        assert!(!HashFunctions::Murmur64Protein.skipm2n3());
+    }
+
+    #[test]
+    fn test_display_hashfunctions() {
+        assert_eq!(HashFunctions::Murmur64Dna.to_string(), "DNA");
+        assert_eq!(HashFunctions::Murmur64Protein.to_string(), "protein");
+        assert_eq!(HashFunctions::Murmur64Dayhoff.to_string(), "dayhoff");
+        assert_eq!(HashFunctions::Murmur64Hp.to_string(), "hp");
+        assert_eq!(HashFunctions::Murmur64Skipm1n3.to_string(), "skipm1n3");
+        assert_eq!(HashFunctions::Murmur64Skipm2n3.to_string(), "skipm2n3");
+        assert_eq!(
+            HashFunctions::Custom("custom_string".into()).to_string(),
+            "custom_string"
+        );
+    }
+
+    #[test]
+    fn test_try_from_str_valid() {
+        assert_eq!(
+            HashFunctions::try_from("dna").unwrap(),
+            HashFunctions::Murmur64Dna
+        );
+        assert_eq!(
+            HashFunctions::try_from("protein").unwrap(),
+            HashFunctions::Murmur64Protein
+        );
+        assert_eq!(
+            HashFunctions::try_from("dayhoff").unwrap(),
+            HashFunctions::Murmur64Dayhoff
+        );
+        assert_eq!(
+            HashFunctions::try_from("hp").unwrap(),
+            HashFunctions::Murmur64Hp
+        );
+        assert_eq!(
+            HashFunctions::try_from("skipm1n3").unwrap(),
+            HashFunctions::Murmur64Skipm1n3
+        );
+        assert_eq!(
+            HashFunctions::try_from("skipm2n3").unwrap(),
+            HashFunctions::Murmur64Skipm2n3
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "not implemented: unknown")]
+    fn test_try_from_str_invalid() {
+        HashFunctions::try_from("unknown").unwrap();
+    }
 }
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 5cc60ca299..7fdb9b7f2d 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -995,6 +995,7 @@ impl TryInto<KmerMinHash> for Signature {
 #[cfg(test)]
 mod test {
 
+    use core::num;
     use std::fs::File;
     use std::io::{BufReader, Read};
     use std::path::PathBuf;
@@ -1003,8 +1004,7 @@ mod test {
 
     use crate::cmd::ComputeParameters;
     use crate::encodings::HashFunctions;
-    use crate::signature::SeqToHashes;
-    use crate::signature::SigsTrait;
+    use crate::signature::{ReadingFrame, SeqToHashes, SigsTrait};
 
     use super::Signature;
 
@@ -1432,6 +1432,15 @@ mod test {
         }
     }
 
+    #[test]
+    fn test_readingframe_dna() {
+        let sequence = b"AGTCGT";
+        let frame = ReadingFrame::new_dna(sequence);
+
+        assert_eq!(frame.fw(), sequence.as_slice());
+        assert_eq!(frame.rc(), b"ACGACT".as_slice());
+    }
+
     #[test]
     fn test_seqtohashes_frames_dna() {
         let sequence = b"AGTCGT";
@@ -1465,6 +1474,16 @@ mod test {
         assert_eq!(frames[0].fw(), sequence.as_slice());
     }
 
+    #[test]
+    fn test_readingframe_protein() {
+        let sequence = b"MVLSPADKTNVKAAW";
+        let hash_function = HashFunctions::Murmur64Protein;
+        let frame =
+            ReadingFrame::new_protein(sequence, hash_function.dayhoff(), hash_function.hp());
+
+        assert_eq!(frame.fw(), sequence.as_slice());
+    }
+
     #[test]
     #[should_panic]
     fn test_seqtohashes_frames_is_protein_try_access_rc() {
@@ -1537,6 +1556,17 @@ mod test {
         assert_eq!(frames[5].fw(), b"LDD".as_slice());
     }
 
+    #[test]
+    #[should_panic(expected = "Skipmer frame number must be < n")]
+    fn test_readingframe_skipmer() {
+        let sequence = b"AGTCGT";
+        let m = 2;
+        let n = 3;
+        let num_frames = 4; // four frames but n is only 3
+
+        ReadingFrame::new_skipmer(sequence, num_frames, m, n);
+    }
+
     #[test]
     fn test_seqtohashes_frames_skipmer_m1n3() {
         let sequence = b"AGTCGTCGAGCT";

From e5ea8af3a6734ab1b67a74ae951cd3cc7976b1ff Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 16 Dec 2024 11:55:51 -0800
Subject: [PATCH 13/22] more tests

---
 src/core/src/signature.rs | 87 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index 7fdb9b7f2d..f7f3836fc2 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -995,7 +995,6 @@ impl TryInto<KmerMinHash> for Signature {
 #[cfg(test)]
 mod test {
 
-    use core::num;
     use std::fs::File;
     use std::io::{BufReader, Read};
     use std::path::PathBuf;
@@ -1441,6 +1440,83 @@ mod test {
         assert_eq!(frame.rc(), b"ACGACT".as_slice());
     }
 
+    #[test]
+    fn test_fw_dna() {
+        let dna_frame = ReadingFrame::DNA {
+            fw: b"ATCG".to_vec(),
+            rc: b"CGAT".to_vec(),
+            len: 4,
+        };
+        assert_eq!(dna_frame.fw(), b"ATCG");
+    }
+
+    #[test]
+    fn test_rc_dna() {
+        let dna_frame = ReadingFrame::DNA {
+            fw: b"ATCG".to_vec(),
+            rc: b"CGAT".to_vec(),
+            len: 4,
+        };
+        assert_eq!(dna_frame.rc(), b"CGAT");
+    }
+
+    #[test]
+    fn test_length_dna() {
+        let dna_frame = ReadingFrame::DNA {
+            fw: b"ATCG".to_vec(),
+            rc: b"CGAT".to_vec(),
+            len: 4,
+        };
+        assert_eq!(dna_frame.length(), 4);
+    }
+
+    #[test]
+    fn test_frame_type_dna() {
+        let dna_frame = ReadingFrame::DNA {
+            fw: b"ATCG".to_vec(),
+            rc: b"CGAT".to_vec(),
+            len: 4,
+        };
+        assert_eq!(dna_frame.frame_type(), "DNA");
+    }
+
+    #[test]
+    fn test_fw_protein() {
+        let protein_frame = ReadingFrame::Protein {
+            fw: b"MVHL".to_vec(),
+            len: 4,
+        };
+        assert_eq!(protein_frame.fw(), b"MVHL");
+    }
+
+    #[test]
+    #[should_panic(expected = "Reverse complement is only available for DNA frames")]
+    fn test_rc_protein_panics() {
+        let protein_frame = ReadingFrame::Protein {
+            fw: b"MVHL".to_vec(),
+            len: 4,
+        };
+        protein_frame.rc();
+    }
+
+    #[test]
+    fn test_length_protein() {
+        let protein_frame = ReadingFrame::Protein {
+            fw: b"MVHL".to_vec(),
+            len: 4,
+        };
+        assert_eq!(protein_frame.length(), 4);
+    }
+
+    #[test]
+    fn test_frame_type_protein() {
+        let protein_frame = ReadingFrame::Protein {
+            fw: b"MVHL".to_vec(),
+            len: 4,
+        };
+        assert_eq!(protein_frame.frame_type(), "Protein");
+    }
+
     #[test]
     fn test_seqtohashes_frames_dna() {
         let sequence = b"AGTCGT";
@@ -1556,6 +1632,15 @@ mod test {
         assert_eq!(frames[5].fw(), b"LDD".as_slice());
     }
 
+    #[test]
+    #[should_panic(expected = "Frame number must be 0, 1, or 2")]
+    fn test_readingframe_translate() {
+        let sequence = b"AGTCGT";
+        let frame_start = 3; // four frames but translate can only
+
+        ReadingFrame::new_translated(sequence, frame_start, false, false);
+    }
+
     #[test]
     #[should_panic(expected = "Skipmer frame number must be < n")]
     fn test_readingframe_skipmer() {

From 8a0600d79d41fe34e53a860dc62d89537087eac3 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 16 Dec 2024 12:43:42 -0800
Subject: [PATCH 14/22] test protein rf display

---
 src/core/src/signature.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index f7f3836fc2..743b5f3e57 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -1517,6 +1517,19 @@ mod test {
         assert_eq!(protein_frame.frame_type(), "Protein");
     }
 
+    #[test]
+    fn test_readingframe_display_protein() {
+        // Create a Protein ReadingFrame
+        let protein_frame = ReadingFrame::Protein {
+            fw: b"MVHLK".to_vec(),
+            len: 5,
+        };
+
+        let output = format!("{}", protein_frame);
+        // Assert the output matches the expected format
+        assert_eq!(output, "Type: Protein (5aa), Forward: MVHLK");
+    }
+
     #[test]
     fn test_seqtohashes_frames_dna() {
         let sequence = b"AGTCGT";

From e2d6c90e622522bd43051277f9ebda826b84fb8e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 19 Dec 2024 13:38:41 -0800
Subject: [PATCH 15/22] init py skipmers

---
 src/core/src/ffi/mod.rs        | 10 ++++++++--
 src/sourmash/index/__init__.py |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs
index 6f1dff78e4..2a72fd549e 100644
--- a/src/core/src/ffi/mod.rs
+++ b/src/core/src/ffi/mod.rs
@@ -36,18 +36,22 @@ pub enum HashFunctions {
     Murmur64Protein = 2,
     Murmur64Dayhoff = 3,
     Murmur64Hp = 4,
+    Murmur64Skipm1n3 = 5,
+    Murmur64Skipm2n3 = 6,
 }
 
 impl From<HashFunctions> for crate::encodings::HashFunctions {
     fn from(v: HashFunctions) -> crate::encodings::HashFunctions {
         use crate::encodings::HashFunctions::{
-            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein,
+            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3,
         };
         match v {
             HashFunctions::Murmur64Dna => Murmur64Dna,
             HashFunctions::Murmur64Protein => Murmur64Protein,
             HashFunctions::Murmur64Dayhoff => Murmur64Dayhoff,
             HashFunctions::Murmur64Hp => Murmur64Hp,
+            HashFunctions::Murmur64Skipm1n3 => Murmur64Skipm1n3,
+            HashFunctions::Murmur64Skipm2n3 => Murmur64Skipm2n3,
         }
     }
 }
@@ -55,13 +59,15 @@ impl From<HashFunctions> for crate::encodings::HashFunctions {
 impl From<crate::encodings::HashFunctions> for HashFunctions {
     fn from(v: crate::encodings::HashFunctions) -> HashFunctions {
         use crate::encodings::HashFunctions::{
-            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein,
+            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3,
         };
         match v {
             Murmur64Dna => HashFunctions::Murmur64Dna,
             Murmur64Protein => HashFunctions::Murmur64Protein,
             Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff,
             Murmur64Hp => HashFunctions::Murmur64Hp,
+            Murmur64Skipm1n3 => HashFunctions::Murmur64Skipm1n3,
+            Murmur64Skipm2n3 => HashFunctions::Murmur64Skipm2n3,
             _ => todo!("Not supported, probably custom"),
         }
     }
diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py
index a8a638653d..b04952d873 100644
--- a/src/sourmash/index/__init__.py
+++ b/src/sourmash/index/__init__.py
@@ -1242,7 +1242,7 @@ def _check_select_parameters(**kw):
 
     moltype = kw.get("moltype")
     if moltype is not None:
-        if moltype not in ["DNA", "protein", "dayhoff", "hp"]:
+        if moltype not in ["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]:
             raise ValueError(f"unknown moltype: {moltype}")
 
     scaled = kw.get("scaled")

From f374b81353e538a89e5ca3c7ff6027638c5d3d7d Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 19 Dec 2024 15:20:13 -0800
Subject: [PATCH 16/22] init ffi py skipmers

---
 src/core/src/ffi/minhash.rs    | 12 +++++++++++
 src/core/src/sketch/minhash.rs |  8 +++++++
 src/sourmash/cli/utils.py      | 33 +++++++++++++++++++++++++++++
 src/sourmash/minhash.py        | 38 +++++++++++++++++++++++++++++++++-
 src/sourmash/sourmash_args.py  | 12 +++++++++--
 5 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs
index 55879dd060..a6fda53d1c 100644
--- a/src/core/src/ffi/minhash.rs
+++ b/src/core/src/ffi/minhash.rs
@@ -320,6 +320,18 @@ pub unsafe extern "C" fn kmerminhash_hp(ptr: *const SourmashKmerMinHash) -> bool
     mh.hp()
 }
 
+#[no_mangle]
+pub unsafe extern "C" fn kmerminhash_skipm1n3(ptr: *const SourmashKmerMinHash) -> bool {
+    let mh = SourmashKmerMinHash::as_rust(ptr);
+    mh.skipm1n3()
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn kmerminhash_skipm2n3(ptr: *const SourmashKmerMinHash) -> bool {
+    let mh = SourmashKmerMinHash::as_rust(ptr);
+    mh.skipm2n3()
+}
+
 #[no_mangle]
 pub unsafe extern "C" fn kmerminhash_seed(ptr: *const SourmashKmerMinHash) -> u64 {
     let mh = SourmashKmerMinHash::as_rust(ptr);
diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs
index d8ca2337a0..9cd9f78b9b 100644
--- a/src/core/src/sketch/minhash.rs
+++ b/src/core/src/sketch/minhash.rs
@@ -711,6 +711,14 @@ impl KmerMinHash {
         self.hash_function == HashFunctions::Murmur64Hp
     }
 
+    pub fn skipm1n3(&self) -> bool {
+        self.hash_function == HashFunctions::Murmur64Skipm1n3
+    }
+
+    pub fn skipm2n3(&self) -> bool {
+        self.hash_function == HashFunctions::Murmur64Skipm2n3
+    }
+
     pub fn mins(&self) -> Vec<u64> {
         self.mins.clone()
     }
diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py
index 26da5ead5f..82a66dfe9b 100644
--- a/src/sourmash/cli/utils.py
+++ b/src/sourmash/cli/utils.py
@@ -50,6 +50,39 @@ def add_moltype_args(parser):
     )
     parser.set_defaults(hp=False)
 
+    parser.add_argument(
+            "--skipm1n3",
+            "--skipmer-m1n3",
+            dest="skipm1n3",
+            action="store_true",
+            help="choose skipmer (m1n3) signatures",
+        )
+
+    parser.add_argument(
+        "--no-skipm1n3",
+        "--no-skipmer-m1n3",
+        dest="skipm1n3",
+        action="store_false",
+        help="do not choose skipmer (m1n3) signatures",
+    )
+    parser.set_defaults(skipm1n3=False)
+
+    parser.add_argument(
+            "--skipm2n3",
+            "--skipmer-m2n3",
+            dest="skipm2n3",
+            action="store_true",
+            help="choose skipmer (m2n3) signatures",
+        )
+    parser.add_argument(
+        "--no-skipm2n3",
+        "--no-skipmer-m2n3",
+        dest="skipm2n3",
+        action="store_false",
+        help="do not choose skipmer (m2n3) signatures",
+    )
+    parser.set_defaults(skipm2n3=False)
+
     parser.add_argument(
         "--dna",
         "--rna",
diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index 90d8244e2d..7188ed7427 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -196,6 +196,8 @@ def __init__(
         is_protein=False,
         dayhoff=False,
         hp=False,
+        skipm1n3=False,
+        skipm2n3=False,
         track_abundance=False,
         seed=MINHASH_DEFAULT_SEED,
         max_hash=0,
@@ -215,6 +217,8 @@ def __init__(
            * is_protein (default False) - aa k-mers
            * dayhoff (default False) - dayhoff encoding
            * hp (default False) - hydrophilic/hydrophobic aa
+           * skipm1n3 (default False) - skipmer (m1n3)
+           * skipm2n3 (default False) - skipmer (m2n3)
            * track_abundance (default False) - track hash multiplicity
            * mins (default None) - list of hashvals, or (hashval, abund) pairs
            * seed (default 42) - murmurhash seed
@@ -243,6 +247,10 @@ def __init__(
         elif is_protein:
             hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
             ksize = ksize * 3
+        elif skipm1n3:
+            hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
+        elif skipm2n3:
+            hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
         else:
             hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA
 
@@ -264,6 +272,8 @@ def __copy__(self):
             is_protein=self.is_protein,
             dayhoff=self.dayhoff,
             hp=self.hp,
+            skipm1n3=self.skipm1n3,
+            skipm2n3=self.skipm2n3,
             track_abundance=self.track_abundance,
             seed=self.seed,
             max_hash=self._max_hash,
@@ -285,6 +295,8 @@ def __getstate__(self):
             self.is_protein,
             self.dayhoff,
             self.hp,
+            self.skipm1n3,
+            self.skipm2n3,
             self.hashes,
             None,
             self.track_abundance,
@@ -300,6 +312,8 @@ def __setstate__(self, tup):
             is_protein,
             dayhoff,
             hp,
+            skipm1n3,
+            skipm2n3,
             mins,
             _,
             track_abundance,
@@ -316,6 +330,10 @@ def __setstate__(self, tup):
             if hp
             else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
             if is_protein
+            else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
+            if skipm1n3
+            else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
+            if skipm2n3
             else lib.HASH_FUNCTIONS_MURMUR64_DNA
         )
 
@@ -340,6 +358,8 @@ def copy_and_clear(self):
             is_protein=self.is_protein,
             dayhoff=self.dayhoff,
             hp=self.hp,
+            skipm1n3=self.skipm1n3,
+            skipm2n3=self.skipm2n3,
             track_abundance=self.track_abundance,
             seed=self.seed,
             max_hash=self._max_hash,
@@ -561,7 +581,9 @@ def scaled(self):
 
     @property
     def is_dna(self):
-        return not (self.is_protein or self.dayhoff or self.hp)
+        return not (
+            self.is_protein or self.dayhoff or self.hp or self.skipm1n3 or self.skipm2n3
+        )
 
     @property
     def is_protein(self):
@@ -575,6 +597,14 @@ def dayhoff(self):
     def hp(self):
         return self._methodcall(lib.kmerminhash_hp)
 
+    @property
+    def skipm1n3(self):
+        return self._methodcall(lib.kmerminhash_skipm1n3)
+
+    @property
+    def skipm2n3(self):
+        return self._methodcall(lib.kmerminhash_skipm2n3)
+
     @property
     def ksize(self):
         k = self._methodcall(lib.kmerminhash_ksize)
@@ -1224,6 +1254,8 @@ def __setstate__(self, tup):
             is_protein,
             dayhoff,
             hp,
+            skipm1n3,
+            skipm2n3,
             mins,
             _,
             track_abundance,
@@ -1240,6 +1272,10 @@ def __setstate__(self, tup):
             if hp
             else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
             if is_protein
+            else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
+            if skipm1n3
+            else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
+            if skipm2n3
             else lib.HASH_FUNCTIONS_MURMUR64_DNA
         )
 
diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py
index 9f2068160a..20028fa921 100644
--- a/src/sourmash/sourmash_args.py
+++ b/src/sourmash/sourmash_args.py
@@ -84,7 +84,7 @@ def check_num_bounds(arg):
 
 def get_moltype(sig, require=False):
     mh = sig.minhash
-    if mh.moltype in ("DNA", "dayhoff", "hp", "protein"):
+    if mh.moltype in ("DNA", "dayhoff", "hp", "protein", "skipm1n3", "skipm2n3"):
         moltype = mh.moltype
     else:
         raise ValueError(f"unknown molecule type for sig {sig}")
@@ -109,9 +109,15 @@ def calculate_moltype(args, default=None):
         moltype = "protein"
         n += 1
 
+    if args.skipm1n3:
+        moltype = "skipm1n3"
+
+    if args.skipm2n3:
+        moltype = "skipm2n3"
+
     if n > 1:
         error(
-            "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff"
+            "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff/--skipm1n3/--skipm2n3"
         )
         sys.exit(-1)
 
@@ -185,11 +191,13 @@ def load_include_exclude_db_patterns(args):
 
         def search_pattern(vals):
             return any(pattern.search(val) for val in vals)
+
     elif args.exclude_db_pattern:
         pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE)
 
         def search_pattern(vals):
             return all(not pattern.search(val) for val in vals)
+
     else:
         search_pattern = None
 

From d66d4aff840a32f988795a7c1a9227fdf054b115 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 20 Dec 2024 01:43:16 +0000
Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/sourmash/cli/utils.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py
index 82a66dfe9b..467ed307fb 100644
--- a/src/sourmash/cli/utils.py
+++ b/src/sourmash/cli/utils.py
@@ -51,12 +51,12 @@ def add_moltype_args(parser):
     parser.set_defaults(hp=False)
 
     parser.add_argument(
-            "--skipm1n3",
-            "--skipmer-m1n3",
-            dest="skipm1n3",
-            action="store_true",
-            help="choose skipmer (m1n3) signatures",
-        )
+        "--skipm1n3",
+        "--skipmer-m1n3",
+        dest="skipm1n3",
+        action="store_true",
+        help="choose skipmer (m1n3) signatures",
+    )
 
     parser.add_argument(
         "--no-skipm1n3",
@@ -68,12 +68,12 @@ def add_moltype_args(parser):
     parser.set_defaults(skipm1n3=False)
 
     parser.add_argument(
-            "--skipm2n3",
-            "--skipmer-m2n3",
-            dest="skipm2n3",
-            action="store_true",
-            help="choose skipmer (m2n3) signatures",
-        )
+        "--skipm2n3",
+        "--skipmer-m2n3",
+        dest="skipm2n3",
+        action="store_true",
+        help="choose skipmer (m2n3) signatures",
+    )
     parser.add_argument(
         "--no-skipm2n3",
         "--no-skipmer-m2n3",

From 246a88fc43af6a1709aa8931c41c27599b12023c Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 20 Dec 2024 09:53:08 -0800
Subject: [PATCH 18/22] upd bindgen

---
 include/sourmash.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/sourmash.h b/include/sourmash.h
index f650035dec..33110912a5 100644
--- a/include/sourmash.h
+++ b/include/sourmash.h
@@ -13,6 +13,8 @@ enum HashFunctions {
   HASH_FUNCTIONS_MURMUR64_PROTEIN = 2,
   HASH_FUNCTIONS_MURMUR64_DAYHOFF = 3,
   HASH_FUNCTIONS_MURMUR64_HP = 4,
+  HASH_FUNCTIONS_MURMUR64_SKIPM1N3 = 5,
+  HASH_FUNCTIONS_MURMUR64_SKIPM2N3 = 6,
 };
 typedef uint32_t HashFunctions;
 
@@ -271,6 +273,10 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr,
                               bool ignore_abundance,
                               bool downsample);
 
+bool kmerminhash_skipm1n3(const SourmashKmerMinHash *ptr);
+
+bool kmerminhash_skipm2n3(const SourmashKmerMinHash *ptr);
+
 void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize);
 
 bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr);

From 0c7ed1bcb612e082eb7a5c6ef744403f61b25657 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 20 Dec 2024 11:18:57 -0800
Subject: [PATCH 19/22] more moltype

---
 src/sourmash/minhash.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index 7188ed7427..d7ed0a03d4 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -291,7 +291,7 @@ def __getstate__(self):
         # get a ksize that makes sense to the Rust layer. See #2262.
         return (
             self.num,
-            self.ksize if self.is_dna else self.ksize * 3,
+            self.ksize if self.is_dna or self.skipm1n3 or self.skipm2n3 else self.ksize * 3,
             self.is_protein,
             self.dayhoff,
             self.hp,
@@ -481,7 +481,7 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False):
 
     def add_kmer(self, kmer):
         "Add a kmer into the sketch."
-        if self.is_dna:
+        if self.is_dna or self.skipm1n3 or self.skipm2n3:
             if len(kmer) != self.ksize:
                 raise ValueError(f"kmer to add is not {self.ksize} in length")
         else:
@@ -608,7 +608,7 @@ def skipm2n3(self):
     @property
     def ksize(self):
         k = self._methodcall(lib.kmerminhash_ksize)
-        if not self.is_dna:
+        if not self.is_dna and not self.skipm1n3 and not self.skipm2n3:
             assert k % 3 == 0
             k = int(k / 3)
         return k
@@ -1081,6 +1081,10 @@ def moltype(self):  # TODO: test in minhash tests
             return "dayhoff"
         elif self.hp:
             return "hp"
+        elif self.skipm1n3:
+            return "skipm1n3"
+        elif self.skipm2n3:
+            return "skipm2n3"
         else:
             return "DNA"
 

From 379633ae6a5caca52c15a619ff3c18aa35f9b0cc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 20 Dec 2024 19:19:10 +0000
Subject: [PATCH 20/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/sourmash/minhash.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index d7ed0a03d4..ce3c0ed599 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -291,7 +291,9 @@ def __getstate__(self):
         # get a ksize that makes sense to the Rust layer. See #2262.
         return (
             self.num,
-            self.ksize if self.is_dna or self.skipm1n3 or self.skipm2n3 else self.ksize * 3,
+            self.ksize
+            if self.is_dna or self.skipm1n3 or self.skipm2n3
+            else self.ksize * 3,
             self.is_protein,
             self.dayhoff,
             self.hp,

From b6601f656d9a86b01b02236a734a0cff0ab753f6 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 20 Dec 2024 11:52:48 -0800
Subject: [PATCH 21/22] fix index for scaled

---
 src/sourmash/minhash.py      | 4 ++++
 src/sourmash/sig/__main__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index ce3c0ed599..c036fa6c3d 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -289,6 +289,10 @@ def __getstate__(self):
         # note: we multiple ksize by 3 here so that
         # pickle protocols that bypass __setstate__ <coff numpy coff>
         # get a ksize that makes sense to the Rust layer. See #2262.
+        # CTB/NTP note: if you add things below, you might want to put
+        # them at the end, because we use internal indexes in a few places.
+        # see especially `_set_num_scaled()` in sig/__main__.my.
+        # My apologies.
         return (
             self.num,
             self.ksize
diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
index 0b97c84347..5809270284 100644
--- a/src/sourmash/sig/__main__.py
+++ b/src/sourmash/sig/__main__.py
@@ -102,7 +102,7 @@ def _set_num_scaled(mh, num, scaled):
     # Number of hashes is 0th parameter
     mh_params[0] = num
     # Scale is 8th parameter
-    mh_params[8] = _get_max_hash_for_scaled(scaled)
+    mh_params[10] = _get_max_hash_for_scaled(scaled)
     mh.__setstate__(mh_params)
     assert mh.num == num
     assert mh.scaled == scaled

From 017280a27bc03b934330c749348f6b0e9ecf327e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 20 Dec 2024 12:03:37 -0800
Subject: [PATCH 22/22] rustfmt

---
 src/core/src/ffi/mod.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs
index 2a72fd549e..dc8fb14c76 100644
--- a/src/core/src/ffi/mod.rs
+++ b/src/core/src/ffi/mod.rs
@@ -43,7 +43,8 @@ pub enum HashFunctions {
 impl From<HashFunctions> for crate::encodings::HashFunctions {
     fn from(v: HashFunctions) -> crate::encodings::HashFunctions {
         use crate::encodings::HashFunctions::{
-            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3,
+            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3,
+            Murmur64Skipm2n3,
         };
         match v {
             HashFunctions::Murmur64Dna => Murmur64Dna,
@@ -59,7 +60,8 @@ impl From<HashFunctions> for crate::encodings::HashFunctions {
 impl From<crate::encodings::HashFunctions> for HashFunctions {
     fn from(v: crate::encodings::HashFunctions) -> HashFunctions {
         use crate::encodings::HashFunctions::{
-            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, Murmur64Skipm2n3,
+            Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3,
+            Murmur64Skipm2n3,
         };
         match v {
             Murmur64Dna => HashFunctions::Murmur64Dna,