From 40d833683539f8b2b3c827568236c1af0fc16b3b Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 19 Aug 2023 19:56:54 -0700 Subject: [PATCH] [skip ci] Start implementing FFI for manifest, picklist and selection --- include/sourmash.h | 67 +++++++++++++ src/core/src/ffi/index/mod.rs | 165 +++++++++++++++++++++++++++++++++ src/core/src/ffi/manifest.rs | 73 +++++++++++++++ src/core/src/ffi/mod.rs | 2 + src/core/src/ffi/picklist.rs | 89 ++++++++++++++++++ src/sourmash/index/__init__.py | 100 +++++++++++++++++--- src/sourmash/manifest.py | 35 +++++++ src/sourmash/picklist.py | 18 ++++ 8 files changed, 537 insertions(+), 12 deletions(-) create mode 100644 src/core/src/ffi/manifest.rs create mode 100644 src/core/src/ffi/picklist.rs diff --git a/include/sourmash.h b/include/sourmash.h index 6fa7854880..2ff83cfcfd 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -51,14 +51,24 @@ typedef struct SourmashHyperLogLog SourmashHyperLogLog; typedef struct SourmashKmerMinHash SourmashKmerMinHash; +typedef struct SourmashManifest SourmashManifest; + +typedef struct SourmashManifestRowIter SourmashManifestRowIter; + typedef struct SourmashNodegraph SourmashNodegraph; +typedef struct SourmashPicklist SourmashPicklist; + typedef struct SourmashRevIndex SourmashRevIndex; typedef struct SourmashSearchResult SourmashSearchResult; +typedef struct SourmashSelection SourmashSelection; + typedef struct SourmashSignature SourmashSignature; +typedef struct SourmashSignatureIter SourmashSignatureIter; + typedef struct SourmashZipStorage SourmashZipStorage; /** @@ -79,6 +89,15 @@ typedef struct { bool owned; } SourmashStr; +typedef struct { + uint32_t ksize; + uint8_t with_abundance; + SourmashStr md5; + SourmashStr internal_location; + SourmashStr name; + SourmashStr moltype; +} SourmashManifestRow; + bool computeparams_dayhoff(const SourmashComputeParameters *ptr); bool computeparams_dna(const SourmashComputeParameters *ptr); @@ -265,6 +284,10 @@ void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); +SourmashManifestRowIter *manifest_rows(const SourmashManifest *ptr); + +const SourmashManifestRow *manifest_rows_iter_next(SourmashManifestRowIter *ptr); + void nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize); bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h); @@ -309,6 +332,18 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize, uintptr_t starting_size, uintptr_t n_tables); +void picklist_free(SourmashPicklist *ptr); + +SourmashPicklist *picklist_new(void); + +void picklist_set_coltype(SourmashPicklist *ptr, const char *coltype_ptr, uintptr_t insize); + +void picklist_set_column_name(SourmashPicklist *ptr, const char *prop_ptr, uintptr_t insize); + +void picklist_set_pickfile(SourmashPicklist *ptr, const char *prop_ptr, uintptr_t insize); + +void picklist_set_pickstyle(SourmashPicklist *ptr, PickStyle pickstyle); + void revindex_free(SourmashRevIndex *ptr); const SourmashSearchResult *const *revindex_gather(const SourmashRevIndex *ptr, @@ -354,6 +389,36 @@ double searchresult_score(const SourmashSearchResult *ptr); SourmashSignature *searchresult_signature(const SourmashSearchResult *ptr); +bool selection_abund(const SourmashSelection *ptr); + +bool selection_containment(const SourmashSelection *ptr); + +uint32_t selection_ksize(const SourmashSelection *ptr); + +HashFunctions selection_moltype(const SourmashSelection *ptr); + +SourmashSelection *selection_new(void); + +uint32_t selection_num(const SourmashSelection *ptr); + +const SourmashPicklist *selection_picklist(const SourmashSelection *ptr); + +uint32_t selection_scaled(const SourmashSelection *ptr); + +void selection_set_abund(SourmashSelection *ptr, bool new_abund); + +void selection_set_containment(SourmashSelection *ptr, bool new_containment); + +void selection_set_ksize(SourmashSelection *ptr, uint32_t new_ksize); + +void selection_set_moltype(SourmashSelection *ptr, HashFunctions new_moltype); + +void selection_set_num(SourmashSelection *ptr, uint32_t new_num); + +void selection_set_picklist(SourmashSelection *ptr, SourmashPicklist *new_picklist); + +void selection_set_scaled(SourmashSelection *ptr, uint32_t new_scaled); + void signature_add_protein(SourmashSignature *ptr, const char *sequence); void signature_add_sequence(SourmashSignature *ptr, const char *sequence, bool force); @@ -388,6 +453,8 @@ void signature_set_mh(SourmashSignature *ptr, const SourmashKmerMinHash *other); void signature_set_name(SourmashSignature *ptr, const char *name); +const SourmashSignature *signatures_iter_next(SourmashSignatureIter *ptr); + SourmashSignature **signatures_load_buffer(const char *ptr, uintptr_t insize, bool _ignore_md5sum, diff --git a/src/core/src/ffi/index/mod.rs b/src/core/src/ffi/index/mod.rs index 932a97b222..af88ec8981 100644 --- a/src/core/src/ffi/index/mod.rs +++ b/src/core/src/ffi/index/mod.rs @@ -1,7 +1,11 @@ pub mod revindex; +use crate::encodings::HashFunctions; +use crate::index::{Selection, SigStore}; + use crate::signature::Signature; +use crate::ffi::picklist::SourmashPicklist; use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::{ForeignObject, SourmashStr}; @@ -35,3 +39,164 @@ pub unsafe extern "C" fn searchresult_signature( let result = SourmashSearchResult::as_rust(ptr); SourmashSignature::from_rust((result.1).clone()) } + +//================================================================ + +pub struct SourmashSelection; + +impl ForeignObject for SourmashSelection { + type RustObject = Selection; +} + +#[no_mangle] +pub unsafe extern "C" fn selection_new() -> *mut SourmashSelection { + SourmashSelection::from_rust(Selection::default()) +} + +#[no_mangle] +pub unsafe extern "C" fn selection_ksize(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(ksize) = sel.ksize() { + ksize + } else { + todo!("empty ksize case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_ksize(ptr: *mut SourmashSelection, new_ksize: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_ksize(new_ksize); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_num(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(num) = sel.num() { + num + } else { + todo!("empty num case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_num(ptr: *mut SourmashSelection, new_num: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_num(new_num); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_scaled(ptr: *const SourmashSelection) -> u32 { + let sel = SourmashSelection::as_rust(ptr); + if let Some(scaled) = sel.scaled() { + scaled + } else { + todo!("empty scaled case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_scaled(ptr: *mut SourmashSelection, new_scaled: u32) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_scaled(new_scaled); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_containment(ptr: *const SourmashSelection) -> bool { + let sel = SourmashSelection::as_rust(ptr); + if let Some(containment) = sel.containment() { + containment + } else { + todo!("empty scaled case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_containment( + ptr: *mut SourmashSelection, + new_containment: bool, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_containment(new_containment); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_abund(ptr: *const SourmashSelection) -> bool { + let sel = SourmashSelection::as_rust(ptr); + if let Some(abund) = sel.abund() { + abund + } else { + todo!("empty abund case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_abund(ptr: *mut SourmashSelection, new_abund: bool) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_abund(new_abund); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_moltype(ptr: *const SourmashSelection) -> HashFunctions { + let sel = SourmashSelection::as_rust(ptr); + if let Some(hash_function) = sel.moltype() { + hash_function + } else { + todo!("empty hash_function case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_moltype( + ptr: *mut SourmashSelection, + new_moltype: HashFunctions, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + sel.set_moltype(new_moltype); +} + +#[no_mangle] +pub unsafe extern "C" fn selection_picklist( + ptr: *const SourmashSelection, +) -> *const SourmashPicklist { + let sel = SourmashSelection::as_rust(ptr); + if let Some(picklist) = sel.picklist() { + SourmashPicklist::from_rust(picklist) + } else { + todo!("empty picklist case not supported yet") + } +} + +#[no_mangle] +pub unsafe extern "C" fn selection_set_picklist( + ptr: *mut SourmashSelection, + new_picklist: *mut SourmashPicklist, +) { + let sel = SourmashSelection::as_rust_mut(ptr); + let pick = SourmashPicklist::into_rust(new_picklist); + sel.set_picklist(*pick); +} + +//================================================================ +// +pub struct SignatureIterator { + iter: Box>>, +} + +pub struct SourmashSignatureIter; + +impl ForeignObject for SourmashSignatureIter { + type RustObject = SignatureIterator; +} + +#[no_mangle] +pub unsafe extern "C" fn signatures_iter_next( + ptr: *mut SourmashSignatureIter, +) -> *const SourmashSignature { + let iterator = SourmashSignatureIter::as_rust_mut(ptr); + + match iterator.iter.next() { + Some(sig) => SourmashSignature::from_rust(sig.into()), + None => std::ptr::null(), + } +} diff --git a/src/core/src/ffi/manifest.rs b/src/core/src/ffi/manifest.rs new file mode 100644 index 0000000000..815f8d83f1 --- /dev/null +++ b/src/core/src/ffi/manifest.rs @@ -0,0 +1,73 @@ +use crate::manifest::{Manifest, Record}; + +use crate::ffi::utils::{ForeignObject, SourmashStr}; + +pub struct SourmashManifest; + +impl ForeignObject for SourmashManifest { + type RustObject = Manifest; +} + +pub struct ManifestRowIterator { + iter: Box>, +} + +pub struct SourmashManifestRowIter; + +impl ForeignObject for SourmashManifestRowIter { + type RustObject = ManifestRowIterator; +} + +#[no_mangle] +pub unsafe extern "C" fn manifest_rows_iter_next( + ptr: *mut SourmashManifestRowIter, +) -> *const SourmashManifestRow { + let iterator = SourmashManifestRowIter::as_rust_mut(ptr); + + match iterator.iter.next() { + Some(row) => SourmashManifestRow::from_rust(row.into()), + None => std::ptr::null(), + } +} + +#[no_mangle] +pub unsafe extern "C" fn manifest_rows( + ptr: *const SourmashManifest, +) -> *mut SourmashManifestRowIter { + let manifest = SourmashManifest::as_rust(ptr); + + let iter = Box::new(manifest.iter()); + SourmashManifestRowIter::from_rust(ManifestRowIterator { iter }) +} + +#[repr(C)] +pub struct SourmashManifestRow { + pub ksize: u32, + pub with_abundance: u8, + pub md5: SourmashStr, + pub internal_location: SourmashStr, + pub name: SourmashStr, + pub moltype: SourmashStr, +} + +impl ForeignObject for SourmashManifestRow { + type RustObject = SourmashManifestRow; +} + +impl From<&Record> for SourmashManifestRow { + fn from(record: &Record) -> SourmashManifestRow { + Self { + ksize: record.ksize(), + with_abundance: record.with_abundance() as u8, + md5: record.md5().into(), + name: record.name().into(), + moltype: record.moltype().to_string().into(), + internal_location: record + .internal_location() + .to_str() + .unwrap() + .to_owned() + .into(), + } + } +} diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index a67de37176..44e856001f 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -9,8 +9,10 @@ pub mod utils; pub mod cmd; pub mod hyperloglog; pub mod index; +pub mod manifest; pub mod minhash; pub mod nodegraph; +pub mod picklist; pub mod signature; pub mod storage; diff --git a/src/core/src/ffi/picklist.rs b/src/core/src/ffi/picklist.rs new file mode 100644 index 0000000000..c7bea755ae --- /dev/null +++ b/src/core/src/ffi/picklist.rs @@ -0,0 +1,89 @@ +use std::os::raw::c_char; +use std::slice; + +use crate::picklist::{PickStyle, Picklist}; + +use crate::ffi::utils::ForeignObject; + +pub struct SourmashPicklist; + +impl ForeignObject for SourmashPicklist { + type RustObject = Picklist; +} + +#[no_mangle] +pub unsafe extern "C" fn picklist_new() -> *mut SourmashPicklist { + SourmashPicklist::from_rust(Picklist::default()) +} + +#[no_mangle] +pub unsafe extern "C" fn picklist_free(ptr: *mut SourmashPicklist) { + SourmashPicklist::drop(ptr); +} + +ffi_fn! { +unsafe fn picklist_set_coltype( + ptr: *mut SourmashPicklist, + coltype_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let coltype = { + assert!(!coltype_ptr.is_null()); + let coltype = slice::from_raw_parts(coltype_ptr as *mut u8, insize); + std::str::from_utf8(coltype)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_coltype(coltype.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_pickfile( + ptr: *mut SourmashPicklist, + prop_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let prop = { + assert!(!prop_ptr.is_null()); + let prop = slice::from_raw_parts(prop_ptr as *mut u8, insize); + std::str::from_utf8(prop)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_pickfile(prop.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_column_name( + ptr: *mut SourmashPicklist, + prop_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let prop = { + assert!(!prop_ptr.is_null()); + let prop = slice::from_raw_parts(prop_ptr as *mut u8, insize); + std::str::from_utf8(prop)? + }; + let pl = SourmashPicklist::as_rust_mut(ptr); + pl.set_column_name(prop.to_string()); + + Ok(()) +} +} + +ffi_fn! { +unsafe fn picklist_set_pickstyle( + ptr: *mut SourmashPicklist, + pickstyle: PickStyle, +) -> Result<()> { + let pl = SourmashPicklist::as_rust_mut(ptr); + + pl.set_pickstyle(pickstyle); + + Ok(()) +} +} diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 08068255e5..c4f50bf8e5 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -34,10 +34,14 @@ CounterGather - an ancillary class returned by the 'counter_gather()' method. """ +from __future__ import annotations + import os import sourmash from abc import abstractmethod, ABC -from collections import namedtuple, Counter +from collections import defaultdict, Counter +from typing import NamedTuple, Optional, TypedDict, TYPE_CHECKING +import weakref from sourmash.search import (make_jaccard_search_query, make_containment_query, @@ -45,12 +49,78 @@ from sourmash.manifest import CollectionManifest from sourmash.logging import debug_literal from sourmash.signature import load_signatures, save_signatures +from sourmash._lowlevel import ffi, lib +from sourmash.utils import RustObject, rustcall, decode_str, encode_str +from sourmash import SourmashSignature +from sourmash.picklist import SignaturePicklist from sourmash.minhash import (flatten_and_downsample_scaled, flatten_and_downsample_num, flatten_and_intersect_scaled) -# generic return tuple for Index.search and Index.gather -IndexSearchResult = namedtuple('Result', 'score, signature, location') +if TYPE_CHECKING: + from typing_extensions import Unpack + + +class IndexSearchResult(NamedTuple): + """generic return tuple for Index.search and Index.gather""" + score: float + signature: SourmashSignature + location: str + + +class Selection(TypedDict): + ksize: Optional[int] + moltype: Optional[str] + num: Optional[int] + scaled: Optional[int] + containment: Optional[bool] + abund: Optional[bool] + picklist: Optional[SignaturePicklist] + + +# TypedDict can't have methods (it is a dict in runtime) +def _selection_as_rust(selection: Selection): + ptr = lib.selection_new() + + for key, v in selection.items(): + if v is not None: + if key == "ksize": + rustcall(lib.selection_set_ksize, ptr, v) + + elif key == "moltype": + hash_function = None + if v.lower() == "dna": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA + elif v.lower() == "protein": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + elif v.lower() == "dayhoff": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + elif v.lower() == "hp": + hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP + + rustcall(lib.selection_set_moltype, ptr, hash_function) + + elif key == "num": + rustcall(lib.selection_set_num, ptr, v) + + elif key == "scaled": + rustcall(lib.selection_set_scaled, ptr, v) + + elif key == "containment": + rustcall(lib.selection_set_containment, ptr, v) + + elif key == "abund": + rustcall(lib.selection_set_abund, ptr, bool(v)) + + elif key == "picklist": + picklist_ptr = v._as_rust() + rustcall(lib.selection_set_picklist, ptr, picklist_ptr) + + else: + raise KeyError(f"Unsupported key {key} for Selection in rust") + + return ptr + class Index(ABC): # this will be removed soon; see sourmash#1894. @@ -307,8 +377,7 @@ def counter_gather(self, query, threshold_bp, **kwargs): return counter @abstractmethod - def select(self, ksize=None, moltype=None, scaled=None, num=None, - abund=None, containment=None): + def select(self, **kwargs: Unpack[Selection]): """Return Index containing only signatures that match requirements. Current arguments can be any or all of: @@ -326,9 +395,16 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, """ -def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): +def select_signature(ss, **kwargs: Unpack[Selection]): "Check that the given signature matches the specified requirements." + ksize = kwargs.get('ksize') + moltype = kwargs.get('moltype') + scaled = kwargs.get('scaled', 0) + num = kwargs.get('num', 0) + containment = kwargs.get('containment', False) + abund = kwargs.get('abund') + picklist = kwargs.get('picklist') + # ksize match? if ksize and ksize != ss.minhash.ksize: return False @@ -408,7 +484,7 @@ def load(cls, location, filename=None): lidx = LinearIndex(si, filename=filename) return lidx - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): """Return new LinearIndex containing only signatures that match req's. Does not raise ValueError, but may return an empty Index. @@ -479,7 +555,7 @@ def save(self, path): def load(cls, path): raise NotImplementedError - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): """Return new object yielding only signatures that match req's. Does not raise ValueError, but may return an empty Index. @@ -642,7 +718,7 @@ def signatures(self): if select(ss): yield ss - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): "Select signatures in zip file based on ksize/moltype/etc." # if we have a manifest, run 'select' on the manifest. @@ -1053,7 +1129,7 @@ def load_from_pathlist(cls, filename): def save(self, *args): raise NotImplementedError - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) return MultiIndex(new_manifest, self.parent, @@ -1162,7 +1238,7 @@ def save(self, *args): def insert(self, *args): raise NotImplementedError - def select(self, **kwargs): + def select(self, **kwargs: Unpack[Selection]): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) return StandaloneManifestIndex(new_manifest, self._location, diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index bfd27eabb9..87331af9ec 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -7,9 +7,13 @@ import os.path from abc import abstractmethod import itertools +from typing import TYPE_CHECKING from sourmash.picklist import SignaturePicklist +if TYPE_CHECKING: + from typing_extensions import Unpack + class BaseCollectionManifest: """ @@ -343,3 +347,34 @@ def to_picklist(self): picklist.pickset = set(self._md5_set) return picklist + + @staticmethod + def _from_rust(value): + from ._lowlevel import ffi, lib + from .utils import rustcall, decode_str + + iterator = rustcall(lib.manifest_rows, value) + + rows = [] + next_row = rustcall(lib.manifest_rows_iter_next, iterator) + while next_row != ffi.NULL: + + # TODO: extract row data from next_row + # FIXME: free mem from strings? + row = {} + row['md5'] = decode_str(next_row.md5) + row['md5short'] = row['md5'][:8] + row['ksize'] = next_row.ksize + row['moltype'] = decode_str(next_row.moltype) + row['num'] = 0 #ss.minhash.num + row['scaled'] = 0 #ss.minhash.scaled + row['n_hashes'] = 0 # len(ss.minhash) + row['with_abundance'] = next_row.with_abundance + row['name'] = decode_str(next_row.name) + row['filename'] = "" #ss.filename + row['internal_location'] = decode_str(next_row.internal_location) + rows.append(row) + + next_row = rustcall(lib.manifest_rows_iter_next, iterator) + + return CollectionManifest(rows) diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py index 30d5c84f90..af15df0990 100644 --- a/src/sourmash/picklist.py +++ b/src/sourmash/picklist.py @@ -252,6 +252,24 @@ def filter(self, it): if self.__contains__(ss): yield ss + def _as_rust(self): + from ._lowlevel import ffi, lib + from .utils import rustcall, decode_str + + ptr = lib.picklist_new() + + rustcall(lib.picklist_set_coltype, ptr, self.coltype.encode('utf-8'), len(self.coltype)) + rustcall(lib.picklist_set_pickfile, ptr, self.pickfile.encode('utf-8'), len(self.pickfile)) + rustcall(lib.picklist_set_column_name, ptr, self.column_name.encode('utf-8'), len(self.column_name)) + rustcall(lib.picklist_set_pickstyle, ptr, self.pickstyle.value) + + #self.preprocess_fn = preprocess[coltype] + #self.pickset = None + #self.found = set() + #self.n_queries = 0 + + return ptr + def passes_all_picklists(ss, picklists): "does the signature 'ss' pass all of the picklists?"