Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove radix tries in favour of simple hashes #13

Merged
merged 4 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
TODO:

- Bool
- Ints
- Floats
85 changes: 85 additions & 0 deletions engine.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package expr

import (
"context"
)

// EngineType identifies which kind of MatchingEngine is in use.
type EngineType int

// The constants are explicitly typed as EngineType. A bare `= iota` would make
// them untyped integer constants, which silently convert to any integer type
// and lose the type when stored in an `any`.
const (
	// EngineTypeNone is the zero value and denotes no engine.
	EngineTypeNone EngineType = iota

	// EngineTypeStringHash is reported by the string-equality hash engine.
	EngineTypeStringHash
	// EngineTypeNullMatch is reported by the null / not-null matching engine.
	EngineTypeNullMatch
	// EngineTypeART
	// EngineTypeBTree
)

// MatchingEngine represents an engine (such as a b-tree, radix trie, or
// simple hash map) which matches a predicate over many expressions.
//
// The contract shared by Match and Search is "no false negatives": an engine
// may return ExpressionParts that do not actually match, but it must never
// omit one that does.
type MatchingEngine interface {
	// Type returns the EngineType identifying this engine implementation.
	Type() EngineType
	// Match takes an input event, containing key:value pairs of data, and
	// matches the given data to any ExpressionParts stored in the engine.
	//
	// Each implementation of the engine may differ on granularity of
	// expression parts received. Some may return false positives, but
	// each MatchingEngine should NEVER omit ExpressionParts which match
	// the given input.
	Match(ctx context.Context, input map[string]any) ([]*ExpressionPart, error)
	// Add adds a new expression part to the matching engine for future matches.
	Add(ctx context.Context, p ExpressionPart) error
	// Remove removes an expression part from the matching engine, ensuring that the
	// ExpressionPart will not be matched in the future.
	Remove(ctx context.Context, p ExpressionPart) error

	// Search searches for a given variable<>value match, returning any expression
	// parts that match.
	//
	// Similar to Match, each implementation of the engine may differ on
	// granularity of expression parts received. Some may return false positives by
	// ignoring the variable name. Note that each MatchingEngine should NEVER
	// omit ExpressionParts which match the given input; false positives are okay,
	// but not returning valid matches must be impossible.
	Search(ctx context.Context, variable string, input any) []*ExpressionPart
}

// Leaf represents the leaf within a tree. This stores all expressions
// which match the given expression.
//
// For example, adding two expressions each matching "event.data == 'foo'"
// in an ART creates a leaf node with both evaluable expressions stored
// in Evals.
//
// Note that there are many sub-clauses which need to be matched. Each
// leaf is a subset of a full expression.
//
// NOTE(review): the original comment trailed off after "Therefore,".
// Presumably the point is that matching a leaf alone does not prove the whole
// expression matches - confirm and complete.
type Leaf struct {
	// Evals holds the expression parts stored at this leaf.
	Evals []*ExpressionPart
}

// ExpressionPart represents a predicate group which is part of an expression.
// All parts for the given group ID must evaluate to true for the predicate to
// be matched.
type ExpressionPart struct {
	// GroupID represents a group ID for the expression part.
	//
	// Within an expression, multiple predicates may be chained with &&. Each
	// of these must evaluate to `true` for an expression to match. Group IDs
	// are shared amongst each predicate within an expression.
	//
	// This lets us determine whether the entire group has been matched.
	GroupID groupID
	// Predicate is the predicate carried by this part. Equals compares
	// predicates by their String() form.
	Predicate Predicate
	// Parsed references the parsed expression associated with this part
	// (presumably the expression this part was extracted from - confirm).
	// Equals compares parts by the expression string of Parsed.Evaluable.
	Parsed *ParsedExpression
}

// Equals reports whether p and n describe the same expression part.
//
// Two parts are equal when they share a GroupID, their predicates render to
// the same string, and their parsed expressions yield the same expression
// string.
//
// Parsed is a pointer and may be unset. If either side has a nil Parsed, the
// parts are equal only when both are nil. This avoids a nil-pointer panic.
func (p ExpressionPart) Equals(n ExpressionPart) bool {
	if p.GroupID != n.GroupID {
		return false
	}
	if p.Predicate.String() != n.Predicate.String() {
		return false
	}
	if p.Parsed == nil || n.Parsed == nil {
		// Nothing to dereference; equal only when both sides are unset.
		return p.Parsed == n.Parsed
	}
	return p.Parsed.Evaluable.GetExpression() == n.Parsed.Evaluable.GetExpression()
}
89 changes: 64 additions & 25 deletions tree_null.go → engine_null.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,98 @@ import (
"sync"

"github.com/google/cel-go/common/operators"
"github.com/ohler55/ojg/jp"
"golang.org/x/sync/errgroup"
)

// TODO: Rename PredicateTrees as these may not be trees -.-
func newNullMatcher() PredicateTree {
func newNullMatcher() MatchingEngine {
return &nullLookup{
lock: &sync.RWMutex{},
null: map[string][]ExpressionPart{},
not: map[string][]ExpressionPart{},
lock: &sync.RWMutex{},
paths: map[string]struct{}{},
null: map[string][]*ExpressionPart{},
not: map[string][]*ExpressionPart{},
}
}

type nullLookup struct {
lock *sync.RWMutex
null map[string][]ExpressionPart
not map[string][]ExpressionPart

// paths stores all variable names as JSON paths used within the engine.
paths map[string]struct{}

null map[string][]*ExpressionPart
not map[string][]*ExpressionPart
}

// Type reports EngineTypeNullMatch, identifying this engine as the null matcher.
func (n *nullLookup) Type() EngineType {
	return EngineTypeNullMatch
}

// Match resolves every JSON path registered in n.paths against data and
// returns the expression parts that Search yields for each resolved value.
//
// A path that is absent from data is searched with a nil value, so it is
// treated the same as an explicit null. Each path is evaluated in its own
// goroutine. The first error from parsing a path is returned alongside
// whatever parts were collected.
func (n *nullLookup) Match(ctx context.Context, data map[string]any) ([]*ExpressionPart, error) {
	// Snapshot the registered paths under the read lock: Add mutates n.paths
	// under the write lock, and ranging over a map during a concurrent write
	// is a fatal data race. The lock is released before any goroutine starts.
	n.lock.RLock()
	paths := make([]string, 0, len(n.paths))
	for path := range n.paths {
		paths = append(paths, path)
	}
	n.lock.RUnlock()

	var (
		// mu guards found, which every goroutine below appends to.
		mu    sync.Mutex
		found = []*ExpressionPart{}
		eg    errgroup.Group
	)

	for _, item := range paths {
		path := item
		eg.Go(func() error {
			x, err := jp.ParseString(path)
			if err != nil {
				return err
			}

			// A missing path leaves value nil, which matches null in our
			// overloads.
			var value any
			if res := x.Get(data); len(res) > 0 {
				value = res[0]
			}

			// This matches null, nil (as null), and any non-null items.
			parts := n.Search(ctx, path, value)

			mu.Lock()
			found = append(found, parts...)
			mu.Unlock()
			return nil
		})
	}

	// Wait before reading found. In `return found, eg.Wait()` the order of the
	// variable read relative to the call is unspecified, so the returned slice
	// could predate the goroutines' appends.
	err := eg.Wait()
	return found, err
}

// Search returns the expression parts registered for variable that can match
// input.
//
// A nil input can only satisfy equality comparisons to null, so the parts
// stored in n.null are returned. Any other input returns the parts stored in
// n.not. The result is nil when nothing is registered for variable.
//
// The read lock is held for the lookup because Add mutates both maps under the
// write lock. Callers must therefore not invoke Search while holding n.lock
// for writing.
func (n *nullLookup) Search(ctx context.Context, variable string, input any) []*ExpressionPart {
	n.lock.RLock()
	defer n.lock.RUnlock()

	if input == nil {
		// The input data is null, so the only items that can match are equality
		// comparisons to null.
		return n.null[variable]
	}
	return n.not[variable]
}

func (n nullLookup) Add(ctx context.Context, p ExpressionPart) error {
func (n *nullLookup) Add(ctx context.Context, p ExpressionPart) error {
n.lock.Lock()
defer n.lock.Unlock()

varName := p.Predicate.Ident

n.paths[varName] = struct{}{}

// If we're comparing to null ("a" == null), we want the variable
// to be null and should place this in the `null` map.
//
// Any other comparison is a not-null comparison.
if p.Predicate.Operator == operators.Equals {
if _, ok := n.null[varName]; !ok {
n.null[varName] = []ExpressionPart{p}
n.null[varName] = []*ExpressionPart{&p}
return nil
}
n.null[varName] = append(n.null[varName], p)
n.null[varName] = append(n.null[varName], &p)
return nil
}

if _, ok := n.not[varName]; !ok {
n.not[varName] = []ExpressionPart{p}
n.not[varName] = []*ExpressionPart{&p}
return nil
}
n.not[varName] = append(n.not[varName], p)
n.not[varName] = append(n.not[varName], &p)
return nil
}

Expand All @@ -66,7 +117,7 @@ func (n *nullLookup) Remove(ctx context.Context, p ExpressionPart) error {

// Remove the expression part from the leaf.
for i, eval := range coll {
if p.Equals(eval) {
if p.Equals(*eval) {
coll = append(coll[:i], coll[i+1:]...)
if p.Predicate.Operator == operators.Equals {
n.null[p.Predicate.Ident] = coll
Expand All @@ -79,15 +130,3 @@ func (n *nullLookup) Remove(ctx context.Context, p ExpressionPart) error {

return ErrExpressionPartNotFound
}

func (n *nullLookup) Search(ctx context.Context, variable string, input any) []ExpressionPart {
if input == nil {
// The input data is null, so the only items that can match are equality
// comparisons to null.
all := n.null[variable]
return all
}

all := n.not[variable]
return all
}
147 changes: 147 additions & 0 deletions engine_stringmap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
package expr

import (
"context"
"fmt"
"strconv"
"sync"

"github.com/cespare/xxhash/v2"
"github.com/google/cel-go/common/operators"
"github.com/ohler55/ojg/jp"
"golang.org/x/sync/errgroup"
)

// newStringEqualityMatcher builds an empty stringLookup engine with its lock
// and both lookup maps initialised, ready for Add.
func newStringEqualityMatcher() MatchingEngine {
	engine := new(stringLookup)
	engine.lock = new(sync.RWMutex)
	engine.vars = make(map[string]struct{})
	engine.strings = make(map[string][]*ExpressionPart)
	return engine
}

// stringLookup represents a very dumb lookup for string equality matching within
// expressions.
//
// This does nothing fancy: it takes strings from expressions then adds them to a hashmap.
// For any incoming event, we take the string values found at the known variable paths and
// look them up in that hashmap to find the ExpressionParts they match.
//
// Note that strings are (obviously) hashed to store in a hashmap, leading to potential
// false positives. Because the aggregate merging filters invalid expressions, this is
// okay: we still evaluate potential matches at the end of filtering.
//
// Due to this, we do not care about variable names for each string. Matching on string
// equality alone cuts down the cost of evaluating non-matching expressions by orders of
// magnitude.
type stringLookup struct {
	// lock guards vars and strings.
	lock *sync.RWMutex

	// vars stores variable names seen within expressions.
	vars map[string]struct{}
	// strings stores all strings referenced within expressions, keyed by the hash of
	// the string and mapped to the expression parts which reference it. This performs
	// string equality lookups.
	strings map[string][]*ExpressionPart
}

// Type reports EngineTypeStringHash, identifying this engine as the
// string-equality hash matcher.
//
// It uses a pointer receiver named n, like every other stringLookup method, so
// the struct is not copied on each call.
func (n *stringLookup) Type() EngineType {
	return EngineTypeStringHash
}

// Match resolves every variable path registered in n.vars against input and
// returns the expression parts that Search yields for each string value found.
//
// Paths that are absent from input, or whose first resolved value is not a
// string, contribute nothing. Each path is evaluated in its own goroutine. The
// first error from parsing a path is returned alongside whatever parts were
// collected.
func (n *stringLookup) Match(ctx context.Context, input map[string]any) ([]*ExpressionPart, error) {
	// Snapshot the variable names under the read lock: Add mutates n.vars
	// under the write lock, and ranging over a map during a concurrent write
	// is a fatal data race. The lock is released before fanning out because
	// Search takes the read lock itself, and recursive read-locking can
	// deadlock once a writer is queued.
	n.lock.RLock()
	paths := make([]string, 0, len(n.vars))
	for path := range n.vars {
		paths = append(paths, path)
	}
	n.lock.RUnlock()

	var (
		// mu guards found, which every goroutine below appends to.
		mu    sync.Mutex
		found = []*ExpressionPart{}
		eg    errgroup.Group
	)

	for _, item := range paths {
		path := item
		eg.Go(func() error {
			x, err := jp.ParseString(path)
			if err != nil {
				return err
			}

			res := x.Get(input)
			if len(res) == 0 {
				return nil
			}
			// Only string values can take part in a string equality lookup.
			str, ok := res[0].(string)
			if !ok {
				return nil
			}

			parts := n.Search(ctx, path, str)

			mu.Lock()
			found = append(found, parts...)
			mu.Unlock()
			return nil
		})
	}

	// Wait before reading found. In `return found, eg.Wait()` the order of the
	// variable read relative to the call is unspecified, so the returned slice
	// could predate the goroutines' appends.
	err := eg.Wait()
	return found, err
}

// Search looks up input in the string table and returns every ExpressionPart
// stored under its hash. The variable name plays no part in the lookup.
//
// A non-string input yields nil, as does a string with no stored parts.
func (n *stringLookup) Search(ctx context.Context, variable string, input any) []*ExpressionPart {
	n.lock.RLock()
	defer n.lock.RUnlock()

	if str, isString := input.(string); isString {
		return n.strings[n.hash(str)]
	}
	return nil
}

// hash maps input to a short map key: the 64-bit xxhash of the string, rendered
// in base 36.
//
// Go maps keep the raw key string, so keying on a fixed-size hash rather than
// the original text keeps memory growth predictable even for long strings. The
// lookup is only _somewhat_ collision-free: two different strings can share a
// key.
func (n *stringLookup) hash(input string) string {
	const base = 36
	return strconv.FormatUint(xxhash.Sum64String(input), base)
}

// Add registers p under the hash of its predicate's string literal and records
// the predicate's identifier as a known variable.
//
// Only equality predicates are accepted; any other operator returns an error
// and leaves the engine untouched.
func (n *stringLookup) Add(ctx context.Context, p ExpressionPart) error {
	if p.Predicate.Operator != operators.Equals {
		return fmt.Errorf("StringHash engines only support string equality")
	}

	n.lock.Lock()
	defer n.lock.Unlock()

	key := n.hash(p.Predicate.LiteralAsString())
	n.vars[p.Predicate.Ident] = struct{}{}

	// Appending to a missing key starts from a nil slice, so the first part
	// for a hash needs no special case.
	n.strings[key] = append(n.strings[key], &p)
	return nil
}

// Remove deletes the first stored part equal to p (per ExpressionPart.Equals)
// from the bucket for p's string literal.
//
// It returns an error for non-equality predicates, and
// ErrExpressionPartNotFound when no equal part is stored. Variable names
// recorded in n.vars are left in place, since other parts may still use them.
func (n *stringLookup) Remove(ctx context.Context, p ExpressionPart) error {
	if p.Predicate.Operator != operators.Equals {
		return fmt.Errorf("StringHash engines only support string equality")
	}

	n.lock.Lock()
	defer n.lock.Unlock()

	val := n.hash(p.Predicate.LiteralAsString())

	coll, ok := n.strings[val]
	if !ok {
		// No expression part has been stored for this string literal.
		return ErrExpressionPartNotFound
	}

	for i, eval := range coll {
		if !p.Equals(*eval) {
			continue
		}

		if len(coll) == 1 {
			// Drop the key entirely so the map does not fill up with empty
			// buckets as parts are added and removed over time.
			delete(n.strings, val)
			return nil
		}

		// Build a fresh slice rather than shifting elements in place. Search
		// returns this bucket to callers, and an in-place shift would rewrite
		// a slice they may still be iterating. It also stops the old backing
		// array from pinning the removed part.
		next := make([]*ExpressionPart, 0, len(coll)-1)
		next = append(next, coll[:i]...)
		next = append(next, coll[i+1:]...)
		n.strings[val] = next
		return nil
	}

	return ErrExpressionPartNotFound
}
Loading
Loading