Skip to content

Commit

Permalink
Fix newlines in sketch kmers
Browse files: browse the repository at this point in the history
  • Loading branch information
audy authored Jan 15, 2024
2 parents fc85af0 + 1539f73 commit 47850b0
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 15 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ path = "src/main.rs"

[dependencies]
clap = "2.33.0"
finch = "0.6"
finch = { version = "0.6.1", path = "../lib" }
serde_json = "1"
anyhow = "1"

Expand Down
12 changes: 9 additions & 3 deletions cli/tests/data/query.fa
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
>id1
AAGGCCTAACTTAATAGGCCCGTATAAATTAGAGCTAGCTAGCTAGTCGATCGATGCTAGCTACGTAACGTCGCGATCGATTATATTCCTCGGCGCGATATTCGCTAGCTAGCTAGCTAGTCATCGATGCATG
AAGGCCTAACTTAATAGGCCCGTATAAATTAGAGCTAGCTAGCTAGTCGATCGATGCTAGCTAC
GTAACGTCGCGATCGATTATATTCCTCGGCGCGATATTCGCTAGCTAGCTAGCTAGTCATCGATGCATG
>id2
AAGGCCTAACTTAATAGGCCCGTATAAATTAGAAGTAGATGATAGATGTGTAGTAGTAGACCGGCTCAGCATCTGAGTCATGCTAGTCGATCGATCGTAGCTAGCGATGCTAGCTAGCTAGCTGTAGCTGCAT
AAGGCCTAACTTAATAGGCCCGTATAAATT
AGAAGTAGATGAT
AGATGTGTAGTAGTAGACCGGCTCAGCATCTGAGTCATGCTAGTCGATCGATCGTAGCTAGCGATGCTAGCTAGCTAGC
TGTAGCTGCAT
>id3
GATGCTGCATGCTACAGCGATGCTAGCTGATGCTAGCTACATTACGAGGCGGCATCTATCAGTCGATCGATCGTAGCTGATCGATCGATGCTGATCGATGCTACATGCTCAGTCGATGCTAGCATGTCAGTCG
GATGCTGCATGCTACA
GCGATGCTAGCTGATGCTAGCTACATTACGAGGCGGC
ATCTATCAGTCGATCGATCGTAGCTGATCGATCGATGCTGATCGATGCTACATGCTCAGTCGATGCTAGCATGTCAGTCG
49 changes: 49 additions & 0 deletions cli/tests/test_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,55 @@ fn finch_sketch_scaled() -> Result<(), Box<dyn std::error::Error>> {
assert_eq!(sketch["kmer"], 21);
assert_eq!(sketch["alphabet"], "ACGT");
assert_eq!(sketch["sketchSize"], 10);

let kmers = &sketch["sketches"][0]["kmers"];

assert_eq!(kmers[0], "ATGCTAGCTACGTAACGTCGC");
assert_eq!(kmers[1], "CAGTCGATCGATCGTAGCTGA");
assert_eq!(kmers[2], "CTCAGATGCTGAGCCGGTCTA");
assert_eq!(kmers[3], "GCTAGCTAGCATCGCTAGCTA");
assert_eq!(kmers[4], "GACTAGCTAGCTAGCTAGCGA");
assert_eq!(kmers[5], "CGCTAGCTACGATCGATCGAC");
assert_eq!(kmers[6], "TAATTTATACGGGCCTATTAA");
assert_eq!(kmers[7], "GCATCAGCTAGCATCGCTGTA");
assert_eq!(kmers[8], "AGCCGGTCTACTACTACACAT");
assert_eq!(kmers[9], "AAGGCCTAACTTAATAGGCCC");

//assert_eq!(sketch["max_hash"], usize::max_value() / 1000);
assert_eq!(sketch["hashSeed"], 0);

Ok(())
}

#[test]
fn finch_sketch_mash() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = Command::cargo_bin("finch")?;
cmd.arg("sketch")
.args(&["--n-hashes", "10"])
.args(&["--sketch-type", "mash"])
.arg("tests/data/query.fa")
.arg("-O");
cmd.assert().success();

let output = Cursor::new(cmd.output().unwrap().stdout);
let sketch: serde_json::Value = serde_json::from_reader(output)?;
assert_eq!(sketch["kmer"], 21);
assert_eq!(sketch["alphabet"], "ACGT");
assert_eq!(sketch["sketchSize"], 10);

let kmers = &sketch["sketches"][0]["kmers"];

assert_eq!(kmers[0], "ATGCTAGCTACGTAACGTCGC");
assert_eq!(kmers[1], "CAGTCGATCGATCGTAGCTGA");
assert_eq!(kmers[2], "CTCAGATGCTGAGCCGGTCTA");
assert_eq!(kmers[3], "GCTAGCTAGCATCGCTAGCTA");
assert_eq!(kmers[4], "GACTAGCTAGCTAGCTAGCGA");
assert_eq!(kmers[5], "CGCTAGCTACGATCGATCGAC");
assert_eq!(kmers[6], "TAATTTATACGGGCCTATTAA");
assert_eq!(kmers[7], "GCATCAGCTAGCATCGCTGTA");
assert_eq!(kmers[8], "AGCCGGTCTACTACTACACAT");
assert_eq!(kmers[9], "AAGGCCTAACTTAATAGGCCC");

//assert_eq!(sketch["max_hash"], usize::max_value() / 1000);
assert_eq!(sketch["hashSeed"], 0);

Expand Down
2 changes: 1 addition & 1 deletion lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "finch"
version = "0.6.0"
version = "0.6.1"
authors = ["One Codex <vincent@onecodex.com>"]
description = "An implementation of min-wise independent permutation locality sensitive hashing ('MinHashing') for genomic data and command-line utility for manipulation."
keywords = ["minhash", "bioinformatics", "sketches"]
Expand Down
1 change: 0 additions & 1 deletion lib/src/serialization/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
use crate::bail;
use crate::errors::FinchResult;
use crate::filtering::FilterParams;
pub use crate::serialization::mash::{read_mash_file, write_mash_file};
use crate::serialization::Sketch;
use crate::sketch_schemes::{KmerCount, SketchParams};

Expand Down
8 changes: 4 additions & 4 deletions lib/src/sketch_schemes/mash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ impl SketchScheme for MashSketcher {
'seq: 'inner,
{
self.total_bases += seq.sequence().len() as u64;
let rc = seq.reverse_complement();
for (_, kmer, is_rev_complement) in
seq.normalize(false).canonical_kmers(self.kmer_length, &rc)
{
let norm_seq = seq.normalize(false);

let rc = norm_seq.reverse_complement();
for (_, kmer, is_rev_complement) in norm_seq.canonical_kmers(self.kmer_length, &rc) {
let rc_count = u8::from(is_rev_complement);
self.push(kmer, rc_count);
}
Expand Down
8 changes: 4 additions & 4 deletions lib/src/sketch_schemes/scaled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ impl SketchScheme for ScaledSketcher {
'seq: 'inner,
{
self.total_bases += seq.sequence().len() as u64;
let rc = seq.reverse_complement();
for (_, kmer, is_rev_complement) in
seq.normalize(false).canonical_kmers(self.kmer_length, &rc)
{
let norm_seq = seq.normalize(false);

let rc = norm_seq.reverse_complement();
for (_, kmer, is_rev_complement) in norm_seq.canonical_kmers(self.kmer_length, &rc) {
let rc_count = u8::from(is_rev_complement);
self.push(kmer, rc_count);
}
Expand Down

0 comments on commit 47850b0

Please sign in to comment.