Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Use a Nodegraph for searching in internal nodes #1138

Closed
wants to merge 12 commits into from
Prev Previous commit
Next Next commit
fix nodegraph containment
  • Loading branch information
luizirber committed Apr 18, 2021
commit 9531a23b81d58c0f88f7bb1220c9f0fef664e4ba
2 changes: 1 addition & 1 deletion src/core/src/sketch/nodegraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ impl Comparable<Nodegraph> for Nodegraph {
.zip(&other.bs)
.map(|(bs, bs_other)| bs.intersection(bs_other).count())
.sum();
- let size: usize = self.bs.iter().map(|bs| bs.len()).sum();
+ let size: usize = self.bs.iter().map(|bs| bs.ones().count()).sum();
result as f64 / size as f64
}
}
Expand Down
13 changes: 3 additions & 10 deletions src/sourmash/sbtmh.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,6 @@ def _max_jaccard_underneath_internal_node(node, query):
# J(A, B) = |A intersection B| / |A union B|
# If we use only |A| as denominator, it is the containment
# Because |A| <= |A union B|, it is also an upper bound on the max jaccard

- # count the maximum number of hash matches beneath this node
- #matches = node.data.matches(mh)
- #max_score = float(matches) / len(mh)

max_score = query_bf.containment(node.data)

return max_score
Expand Down Expand Up @@ -151,7 +146,6 @@ def search_minhashes_containment(node, sig, threshold, results=None, downsample=
else: # Node or Leaf, Nodegraph by minhash comparison
bf = _get_bf(node, sig)
matches = bf.containment(node.data) * len(mh)
- #matches = node.data.matches(mh)

if len(mh) and float(matches) / len(mh) >= threshold:
return 1
Expand All @@ -166,14 +160,14 @@ def search_minhashes_max_containment(node, sig, threshold, results=None,

if isinstance(node, SigLeaf):
node_mh = node.data.minhash

matches = node_mh.count_common(mh, downsample)
node_size = len(node_mh)
else: # Node or Leaf, Nodegraph by minhash comparison
+ bf = _get_bf(node, sig)
+ matches = bf.containment(node.data) * len(mh)
+ node_size = len(mh) # FIXME

- #matches = node.data.matches(mh)
- bf = _get_bf(node, sig)
- matches = bf.containment(node.data) * len(mh)

denom = min((len(mh), node_size))

Expand All @@ -199,7 +193,6 @@ def search(self, node, query, threshold, results=None):
else: # Nodegraph by minhash comparison
bf = _get_bf(node, query)
matches = bf.containment(node.data) * len(mh)
- #matches = node.data.matches(mh)

if not matches:
return 0
Expand Down