Skip to content

Commit

Permalink
MRG: in core, enable downsample within select (#2931)
Browse files Browse the repository at this point in the history
Attempting #1292 in order to move forward with
sourmash-bio/sourmash_plugin_branchwater#134.

Modifies `Signature` `Select` to downsample automatically.

- for scaled sketches, while checking ksize, we also retain only
sketches whose scaled value already matches the requested one or can be
downsampled to it (scaled <= selection.scaled())
- next, we iterate through the retained sketches and downsample any where
scaled < selection.scaled()

Note that for `sourmash_plugin_branchwater` compatibility, we need:

- `byteorder` = "1.4.3"
- `wasm-bindgen` = "0.2.89"
- `once_cell` = "1.18.0"
---------

Co-authored-by: Luiz Irber <luizirber@users.noreply.github.com>
  • Loading branch information
bluegenes and luizirber authored Jan 23, 2024
1 parent a0f3016 commit 94b88cc
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 5 deletions.
6 changes: 3 additions & 3 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ default = []

[dependencies]
az = "1.0.0"
byteorder = "1.5.0"
byteorder = "1.4.3"
camino = { version = "1.1.6", features = ["serde1"] }
cfg-if = "1.0"
counter = "0.5.7"
Expand All @@ -45,7 +45,7 @@ murmurhash3 = "0.0.5"
niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] }
nohash-hasher = "0.2.0"
num-iter = "0.1.43"
once_cell = "1.19.0"
once_cell = "1.18.0"
ouroboros = "0.18.3"
piz = "0.5.0"
primal-check = "0.3.1"
Expand Down Expand Up @@ -88,7 +88,7 @@ skip_feature_sets = [
## Wasm section. Crates only used for WASM, as well as specific configurations

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.wasm-bindgen]
version = "0.2.90"
version = "0.2.89"
features = ["serde-serialize"]

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.web-sys]
Expand Down
69 changes: 67 additions & 2 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -771,8 +771,14 @@ impl Select for Signature {
} else {
valid
};
// TODO: execute downsample if needed

// keep compatible scaled if applicable
if let Some(sel_scaled) = selection.scaled() {
valid = if let Sketch::MinHash(mh) = s {
valid && mh.scaled() <= sel_scaled as u64
} else {
valid
};
}
/*
valid = if let Some(abund) = selection.abund() {
valid && *s.with_abundance() == abund
Expand All @@ -785,8 +791,20 @@ impl Select for Signature {
valid
};
*/

valid
});

// downsample the retained sketches if needed.
if let Some(sel_scaled) = selection.scaled() {
for sketch in self.signatures.iter_mut() {
if let Sketch::MinHash(mh) = sketch {
if (mh.scaled() as u32) < sel_scaled {
*sketch = Sketch::MinHash(mh.downsample_scaled(sel_scaled as u64)?);
}
}
}
}
Ok(self)
}
}
Expand Down Expand Up @@ -841,6 +859,10 @@ mod test {

use super::Signature;

use crate::prelude::Select;
use crate::selection::Selection;
use crate::sketch::Sketch;

#[test]
fn load_sig() {
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
Expand Down Expand Up @@ -979,4 +1001,47 @@ mod test {
assert_eq!(sk.size(), 500);
}
}

/// Selecting with `scaled = 2000` must leave every retained MinHash sketch
/// reporting a scaled value of exactly 2000.
///
/// Loads the `47+63-multisig.sig` fixture, applies a `Selection` that only
/// sets `scaled`, and checks each MinHash sketch of each selected signature.
/// NOTE(review): the fixture sketches presumably start at a scaled value
/// below 2000, so this exercises downsampling; confirm against the test data.
#[test]
fn selection_with_downsample() {
    // Fixture path is resolved relative to the crate's manifest directory.
    let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    filename.push("../../tests/test-data/47+63-multisig.sig");

    let file = File::open(filename).unwrap();
    let reader = BufReader::new(file);
    let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");

    // Build a selection that constrains only the scaled value.
    let mut selection = Selection::default();
    selection.set_scaled(2000);

    // `select` is called on a clone so `sigs` itself is left untouched.
    for sig in &sigs {
        let modified_sig = sig.clone().select(&selection).unwrap();
        for sketch in modified_sig.sketches() {
            // Only MinHash sketches are inspected; other sketch kinds are skipped.
            if let Sketch::MinHash(mh) = sketch {
                assert_eq!(mh.scaled(), 2000);
            }
        }
    }
}

/// Selecting with `scaled = 100` must leave every signature from the
/// `47+63-multisig.sig` fixture with a size of zero.
#[test]
fn selection_scaled_too_low() {
    // Resolve the fixture relative to the crate's manifest directory.
    let sig_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../../tests/test-data/47+63-multisig.sig");

    let loaded: Vec<Signature> =
        serde_json::from_reader(BufReader::new(File::open(sig_path).unwrap()))
            .expect("Loading error");

    // Request a scaled value of 100 (original scaled is 1000).
    let mut too_low = Selection::default();
    too_low.set_scaled(100);

    // Nothing should survive selection for any of the loaded signatures.
    for original in loaded {
        let selected = original.select(&too_low).unwrap();
        assert_eq!(selected.size(), 0);
    }
}
}

0 comments on commit 94b88cc

Please sign in to comment.