Skip to content

Commit

Permalink
Issue 49 (suggest-go#50)
Browse files Browse the repository at this point in the history
* issue-49 Update roaring lib

* issue-49 Move sorted array to separate file + add factory method

* issue-49 add go 1.14
  • Loading branch information
alldroll authored Jul 19, 2020
1 parent b5a55b7 commit 3e17bfd
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 185 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ language: go

go:
- "1.13.x"
- "1.14.x"

script:
- make vet
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/suggest-go/suggest
go 1.13

require (
github.com/RoaringBitmap/roaring v0.4.23
github.com/RoaringBitmap/roaring v0.5.0
github.com/alldroll/cdb v1.0.2
github.com/alldroll/go-datastructures v0.0.0-20190322060030-1d3a19ff3b29
github.com/edsrzf/mmap-go v0.0.0-20190108065903-904c4ced31cd
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
github.com/RoaringBitmap/roaring v0.4.23 h1:gpyfd12QohbqhFO4NVDUdoPOCXsyahYRQhINmlHxKeo=
github.com/RoaringBitmap/roaring v0.4.23/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo=
github.com/RoaringBitmap/roaring v0.5.0 h1:0psZZWU0J2AUl29BAylpHAsuBEEhCEfTKl2v5yHtXIg=
github.com/RoaringBitmap/roaring v0.5.0/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo=
github.com/alldroll/cdb v1.0.2 h1:pSB3BphsF0m2DqOZm+IFyNm38nz1R8kCg3DPCusPLQE=
github.com/alldroll/cdb v1.0.2/go.mod h1:PK3VAN9pconusJqa4kzOupYg9QxOnmgU8AcBWhuZZdo=
github.com/alldroll/go-datastructures v0.0.0-20190322060030-1d3a19ff3b29 h1:gKZgtn2ud0FxyG0lFqrp8hRnvABoqkDTNgebDlbmtgM=
Expand Down
5 changes: 3 additions & 2 deletions pkg/lm/ngram_reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import (
"bufio"
"errors"
"fmt"
"github.com/suggest-go/suggest/pkg/store"
"strconv"
"strings"

"github.com/suggest-go/suggest/pkg/store"
)

// NGramReader is responsible for creating NGramModel from the files
Expand Down Expand Up @@ -40,7 +41,7 @@ func (gr *googleNGramFormatReader) Read() (NGramModel, error) {
vectors := make([]NGramVector, 0, int(gr.nGramOrder))

for i := 0; i < int(gr.nGramOrder); i++ {
builder := NewNGramVectorBuilder(vectors)
builder := NewNGramVectorBuilder(vectors, CreateSortedArray)

if err := gr.readNGramVector(builder, i+1); err != nil {
return nil, fmt.Errorf("failed to read %d ngram vector: %w", i+1, err)
Expand Down
163 changes: 1 addition & 162 deletions pkg/lm/ngram_vector.go
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
package lm

import (
"bytes"
"encoding/binary"
"encoding/gob"
"errors"
"fmt"
"log"
"math"
"sort"
"strconv"

"github.com/suggest-go/suggest/pkg/utils"
)

type (
// ContextOffset represents the id of parent nGram path
ContextOffset = uint32
key = uint64
key = uint64
)

// NGramVector represents one level of nGram trie
Expand All @@ -43,155 +34,3 @@ var (
// ErrContextOverflow tells that it was an attempt to use a context offset greater than maxContextOffset
ErrContextOverflow = errors.New("out of maxContextOffset")
)

// sortedArray is an NGramVector implementation backed by two parallel slices:
// values[i] is the count stored for keys[i]. Lookups binary-search keys,
// so keys are expected to be in ascending order.
type sortedArray struct {
// keys holds the packed uint64 keys produced by makeKey
keys []key
// values holds the WordCount for the key at the same index
values []WordCount
// total is the value reported by CorpusCount
total WordCount
}

// GetCount looks up the (word, context) pair and returns the count stored for it
// together with the offset of the matching entry. When the pair is absent,
// it returns a zero count and InvalidContextOffset.
func (s *sortedArray) GetCount(word WordID, context ContextOffset) (WordCount, ContextOffset) {
	offset := s.find(makeKey(word, context))

	if offset != InvalidContextOffset {
		return s.values[int(offset)], offset
	}

	return 0, InvalidContextOffset
}

// GetContextOffset resolves the (word, context) pair to the offset of its entry,
// or to InvalidContextOffset when there is no such entry.
func (s *sortedArray) GetContextOffset(word WordID, context ContextOffset) ContextOffset {
	packed := makeKey(word, context)
	offset := s.find(packed)

	return offset
}

// CorpusCount reports the total count kept for the collection.
// Vectors produced by SubVector carry the same total as their parent.
func (s *sortedArray) CorpusCount() WordCount {
	total := s.total

	return total
}

// SubVector returns a view over the entries whose key lies in the half-open range
// [makeKey(0, context), makeKey(maxContextOffset-2, context)).
// The view shares the underlying storage and the total with the receiver.
// It returns nil when no key is greater than or equal to the lower bound.
func (s *sortedArray) SubVector(context ContextOffset) NGramVector {
	lower := makeKey(0, context)
	upper := makeKey(maxContextOffset-2, context)
	size := len(s.keys)

	// the first entry that may belong to the given context
	begin := sort.Search(size, func(pos int) bool { return s.keys[pos] >= lower })

	if begin >= size {
		return nil
	}

	// the number of entries below the upper bound, counted from begin
	tail := s.keys[begin:]
	length := sort.Search(len(tail), func(pos int) bool { return tail[pos] >= upper })
	end := begin + length

	return &sortedArray{
		keys:   s.keys[begin:end],
		values: s.values[begin:end],
		total:  s.total,
	}
}

// MarshalBinary serializes the receiver. The output starts with a text header line
// "<key bytes> <value bytes> <total>\n", followed by the delta-encoded uvarint keys
// and then the values as fixed-width little-endian uint32.
func (s *sortedArray) MarshalBinary() ([]byte, error) {
	// delta-encode the keys into a scratch buffer sized for the worst case
	keyBytes := make([]byte, len(s.keys)*binary.MaxVarintLen64)
	keyLen := 0
	last := uint64(0)

	for i := range s.keys {
		keyLen += binary.PutUvarint(keyBytes[keyLen:], s.keys[i]-last)
		last = s.keys[i]
	}

	// every value takes exactly 4 bytes
	valLen := 4 * len(s.values)
	valBytes := make([]byte, valLen)

	for i := range s.values {
		binary.LittleEndian.PutUint32(valBytes[4*i:4*i+4], s.values[i])
	}

	var out bytes.Buffer

	// reserve room for the payload and the header up front
	out.Grow(keyLen + valLen + 4 + strconv.IntSize*2)

	_, err := fmt.Fprintln(&out, keyLen, valLen, s.total)
	if err != nil {
		return nil, err
	}

	out.Write(keyBytes[:keyLen])
	out.Write(valBytes)

	return out.Bytes(), nil
}

// UnmarshalBinary decodes the binary form produced by MarshalBinary: a text header line
// "<key bytes> <value bytes> <total>\n", the delta-encoded uvarint keys and the
// little-endian uint32 values.
//
// It returns an error instead of panicking when the header is inconsistent with the
// payload or a key cannot be decoded. The receiver is modified only on success.
func (s *sortedArray) UnmarshalBinary(data []byte) error {
	buf := bytes.NewBuffer(data)
	keySize, valSize := 0, 0
	total := s.total

	if _, err := fmt.Fscanln(buf, &keySize, &valSize, &total); err != nil {
		return err
	}

	// sizes come from untrusted input: reject values that could not have been
	// written by MarshalBinary before using them for slicing or allocation
	if keySize < 0 || valSize < 0 || valSize%4 != 0 {
		return fmt.Errorf("invalid header: key size %d, value size %d", keySize, valSize)
	}

	if keySize > buf.Len() || valSize > buf.Len()-keySize {
		return fmt.Errorf(
			"corrupted data: header declares %d key and %d value bytes, %d bytes left",
			keySize,
			valSize,
			buf.Len(),
		)
	}

	count := valSize / 4 // keys and values have the same number of items
	encodedKeys := buf.Next(keySize)
	keys := make([]key, count)
	prev := uint64(0)
	keyEndPos := 0

	// 0, 1, 3, 6, 7 -> 0, 1, 4, 10, 17 (delta decoding)
	for i := range keys {
		delta, n := binary.Uvarint(encodedKeys[keyEndPos:])

		// n == 0 means the buffer is too small, n < 0 means a 64-bit overflow
		if n <= 0 {
			return fmt.Errorf("corrupted data: failed to decode key %d of %d", i, count)
		}

		prev += delta
		keys[i] = prev
		keyEndPos += n
	}

	encodedValues := buf.Next(valSize)
	values := make([]WordCount, count)

	for i := range values {
		values[i] = binary.LittleEndian.Uint32(encodedValues[i*4 : (i+1)*4])
	}

	s.keys, s.values, s.total = keys, values, total

	return nil
}

// find binary-searches the collection for the given key and returns its position
// as ContextOffset, or InvalidContextOffset when the key is not stored.
func (s *sortedArray) find(key uint64) ContextOffset {
	size := len(s.keys)

	if size == 0 {
		return InvalidContextOffset
	}

	// fast rejection: the key lies outside of the [first, last] range
	if key < s.keys[0] || key > s.keys[size-1] {
		return InvalidContextOffset
	}

	pos := sort.Search(size, func(idx int) bool { return s.keys[idx] >= key })

	if pos < size && s.keys[pos] == key {
		return ContextOffset(pos)
	}

	return InvalidContextOffset
}

// makeKey packs the pair (word, context) into a single uint64 key via utils.Pack.
// A context above maxContextOffset terminates the process through log.Fatal.
func makeKey(word WordID, context ContextOffset) key {
	if overflow := context > maxContextOffset; overflow {
		log.Fatal(ErrContextOverflow)
	}

	packed := utils.Pack(context, word)

	return packed
}

// getWordID extracts the word id from a key built by makeKey
// (the right component as unpacked by utils.UnpackRight).
func getWordID(k key) WordID {
	return utils.UnpackRight(k)
}

// init makes *sortedArray known to encoding/gob, so that it can be
// encoded and decoded when stored behind an interface value.
func init() {
	gob.Register(new(sortedArray))
}
25 changes: 7 additions & 18 deletions pkg/lm/ngram_vector_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,24 @@ type NGramVectorBuilder interface {
Build() NGramVector
}

// NGramVectorFactory represents a factory method for creating a NGramVector instance.
type NGramVectorFactory func(tree rbtree.Tree) NGramVector

// ErrNGramOrderIsOutOfRange informs that the order of the given nGrams is out of range
var ErrNGramOrderIsOutOfRange = errors.New("nGrams order is out of range")

// nGramVectorBuilder implements NGramVectorBuilder interface.
// It collects nGram nodes in a tree and delegates the construction
// of the resulting NGramVector to the configured factory.
type nGramVectorBuilder struct {
// parents are the vectors handed to NewNGramVectorBuilder
// NOTE(review): presumably the already built lower-order vectors — confirm against Put
parents []NGramVector
// factory creates the NGramVector from the tree when Build is called
factory NGramVectorFactory
// tree accumulates the nodes; NewNGramVectorBuilder creates it empty
tree rbtree.Tree
}

// NewNGramVectorBuilder creates new instance of NGramVectorBuilder
func NewNGramVectorBuilder(parents []NGramVector) NGramVectorBuilder {
func NewNGramVectorBuilder(parents []NGramVector, factory NGramVectorFactory) NGramVectorBuilder {
return &nGramVectorBuilder{
parents: parents,
factory: factory,
tree: rbtree.New(),
}
}
Expand Down Expand Up @@ -66,23 +71,7 @@ func (m *nGramVectorBuilder) Put(nGrams []WordID, count WordCount) error {

// Build creates new instance of NGramVector
func (m *nGramVectorBuilder) Build() NGramVector {
var node *nGramNode
keys := make([]uint64, 0, m.tree.Len())
values := make([]WordCount, 0, m.tree.Len())
total := WordCount(0)

for iter := m.tree.NewIterator(); iter.Next() != nil; {
node = iter.Get().(*nGramNode)
keys = append(keys, node.key)
values = append(values, node.value)
total += node.value
}

return &sortedArray{
keys: keys,
values: values,
total: total,
}
return m.factory(m.tree)
}

// nGramNode represents tree node for the given nGram
Expand Down
Loading

0 comments on commit 3e17bfd

Please sign in to comment.