Skip to content

Commit

Permalink
refactor(x/data): update data module to use orm (part 2) (#970)
Browse files Browse the repository at this point in the history
* refactor(x/data): update data module to use orm (part 2)

* minor fixes

* fix hasher tests

* fix hasher tests

* consolidate keys and utils

* remove prefix

* standalone method

* fix import

* use blake2b

* update new hash

Co-authored-by: Aaron Craelius <aaron@regen.network>
  • Loading branch information
ryanchristo and aaronc authored Apr 5, 2022
1 parent 5af6387 commit 840ba08
Show file tree
Hide file tree
Showing 16 changed files with 327 additions and 636 deletions.
6 changes: 3 additions & 3 deletions x/data/client/testsuite/grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func (s *IntegrationTestSuite) TestQueryByIRI() {
"invalid IRI",
fmt.Sprintf("%s/regen/data/v1/by-iri/%s", val.APIAddress, "foo"),
true,
"key not found",
"not found",
},
{
"valid request",
Expand Down Expand Up @@ -175,7 +175,7 @@ func (s *IntegrationTestSuite) TestQueryAttestors() {
"invalid attestor",
fmt.Sprintf("%s/regen/data/v1/attestors/%s", val.APIAddress, "foo"),
true,
"key not found",
"not found",
0,
},
{
Expand Down Expand Up @@ -289,7 +289,7 @@ func (s *IntegrationTestSuite) TestQueryResolvers() {
"invalid iri",
fmt.Sprintf("%s/regen/data/v1/resolvers/%s", val.APIAddress, "foo"),
true,
"key not found",
"not found",
0,
},
{
Expand Down
6 changes: 3 additions & 3 deletions x/data/client/testsuite/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func (s *IntegrationTestSuite) TestQueryByIRICmd() {
name: "invalid iri",
args: []string{"foo"},
expErr: true,
expErrMsg: "key not found",
expErrMsg: "invalid IRI",
},
{
name: "valid",
Expand Down Expand Up @@ -230,7 +230,7 @@ func (s *IntegrationTestSuite) TestQueryAttestorsCmd() {
name: "invalid attestor",
args: []string{"foo"},
expErr: true,
expErrMsg: "key not found",
expErrMsg: "not found",
},
{
name: "valid",
Expand Down Expand Up @@ -344,7 +344,7 @@ func (s *IntegrationTestSuite) TestQueryResolversCmd() {
name: "invalid iri",
args: []string{"abcd"},
expErr: true,
expErrMsg: "can't find",
expErrMsg: "not found",
},
{
name: "valid test",
Expand Down
2 changes: 1 addition & 1 deletion x/data/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ require (
github.com/tendermint/tendermint v0.34.15
google.golang.org/genproto v0.0.0-20220222213610-43724f9ea8cf
google.golang.org/grpc v1.44.0
google.golang.org/protobuf v1.27.1
gotest.tools/v3 v3.1.0
)

Expand Down Expand Up @@ -119,7 +120,6 @@ require (
golang.org/x/sys v0.0.0-20211004093028-2c5d950f24ef // indirect
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1 // indirect
golang.org/x/text v0.3.7 // indirect
google.golang.org/protobuf v1.27.1 // indirect
gopkg.in/ini.v1 v1.63.2 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
Expand Down
18 changes: 18 additions & 0 deletions x/data/server/hasher/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
Package hasher generates a unique binary identifier for a longer piece of binary data
using an efficient, non-cryptographic hash function.
A new Hasher instance can be created with the NewHasher() function. Advanced users can use
the NewHasherWithOptions() function to tweak the underlying parameters, but the defaults
were chosen based on testing and should provide a good balance of performance and storage
efficiency.
Shortened identifiers are generated using the idempotent Hasher.CreateID method.
The default algorithm uses the first 4 bytes of a 64-bit BLAKE2b hash and then
increases the length in the case of collisions. Identifiers will be 4 bytes long in the vast
majority of cases and will sometimes be 5 and rarely 6 bytes long. In some extremely rare
cases (which have not appeared in tests), identifiers may be longer.
*/
package hasher
102 changes: 102 additions & 0 deletions x/data/server/hasher/hasher.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package hasher

import (
"encoding/binary"
"fmt"
"hash"

"golang.org/x/crypto/blake2b"
)

// Hasher produces a short binary identifier for a longer piece of binary data
// by hashing the data.
type Hasher interface {
	// CreateID is an idempotent method for creating a shortened identifier
	// for the provided binary value: the same value and collisions arguments
	// are expected to yield the same identifier on every call.
	//
	// collisions is the number of candidate identifiers for value that the
	// caller has already found to be taken. Callers start at 0 and retry with
	// collisions+1 until the returned identifier is free.
	CreateID(value []byte, collisions int) []byte
}

// NewHasher returns a Hasher built from the zero-value HashOptions, which
// NewHasherWithOptions resolves to the package defaults: identifiers start
// from the first 4 bytes of an 8-byte (64-bit) BLAKE2b digest. On a collision
// a longer identifier is produced, which should only be needed in a minority
// of cases unless the data set is very large.
func NewHasher() (Hasher, error) {
	var defaults HashOptions
	return NewHasherWithOptions(defaults)
}

// NewHasherWithOptions creates a Hasher with custom options. Most users should
// just use NewHasher with the default values.
//
// A zero options.MinLength selects the default of 4 bytes and a nil
// options.NewHash selects an 8-byte BLAKE2b digest. An error is returned when
// MinLength is negative or exceeds the length of the digest produced by the
// selected hash.
func NewHasherWithOptions(options HashOptions) (Hasher, error) {
	minLength := options.MinLength
	if minLength < 0 {
		// A negative length would otherwise only surface later as a runtime
		// panic inside CreateID.
		return nil, fmt.Errorf("option MinLength %d must not be negative", minLength)
	}
	if minLength == 0 {
		minLength = 4
	}

	newHash := options.NewHash
	if newHash == nil {
		newHash = func() hash.Hash {
			h, err := blake2b.New(8, nil)
			if err != nil {
				panic(err) // an error should not occur creating a hash
			}
			return h
		}
	}

	// Measure the digest length from an actual Sum call rather than Size so
	// that wrappers which only override Sum are measured correctly.
	hashLen := len(newHash().Sum(nil))
	if minLength > hashLen {
		return nil, fmt.Errorf("option MinLength %d is greater than hash length %d", minLength, hashLen)
	}

	return hasher{
		minLen: minLength,
		// Room for the full digest plus the largest possible varint suffix.
		bufLen:  hashLen + binary.MaxVarintLen64,
		newHash: newHash,
		hashLen: hashLen,
	}, nil
}

// HashOptions is used to specify custom hash options and should only be used by advanced users.
// The zero value selects the defaults applied by NewHasherWithOptions.
type HashOptions struct {
	// NewHash is a function which returns a new hash.Hash instance.
	// When nil, NewHasherWithOptions uses an 8-byte BLAKE2b hash.
	NewHash func() hash.Hash

	// MinLength is the minimum number of hash bytes that will be used to create a lookup identifier.
	// When zero, NewHasherWithOptions uses 4. It must not exceed the length of the
	// digest produced by NewHash.
	MinLength int
}

// hasher is the Hasher implementation returned by NewHasherWithOptions.
type hasher struct {
	// minLen is the number of digest bytes used when there are no collisions.
	minLen int
	// bufLen is the maximum identifier length: hashLen plus the largest varint.
	bufLen int
	// newHash returns a fresh hash.Hash for every CreateID call.
	newHash func() hash.Hash
	// hashLen is the length in bytes of the digest produced by newHash.
	hashLen int
}

// CreateID returns the identifier for value given the number of collisions the
// caller has already encountered.
//
// While minLen+collisions does not exceed the digest length, the identifier is
// the first minLen+collisions bytes of the digest, so each collision extends
// the identifier by one further digest byte. Once the digest is exhausted, the
// identifier is the full digest followed by collisions encoded as a uvarint.
// Such collisions are almost impossible with good settings, but can happen
// with a suboptimal hash function.
//
// For a fixed value, every collisions count yields a distinct identifier, and
// the same (value, collisions) pair always yields the same identifier.
//
// CreateID panics if collisions is negative or if the hash reports a write
// error, neither of which should happen in correct use.
func (t hasher) CreateID(value []byte, collisions int) (id []byte) {
	if collisions < 0 {
		panic("hasher: collisions must not be negative")
	}

	h := t.newHash()
	if _, err := h.Write(value); err != nil {
		// we panic here because hash.Write returning an error shouldn't happen
		panic(err)
	}
	digest := h.Sum(nil)

	// Compare against hashLen-minLen rather than adding to collisions so a
	// very large collisions value cannot overflow.
	if collisions <= t.hashLen-t.minLen {
		id = make([]byte, t.minLen+collisions)
		copy(id, digest)
		return id
	}

	// Digest exhausted: keep all of it and append a disambiguating varint.
	id = make([]byte, t.bufLen)
	copy(id, digest[:t.hashLen])
	n := binary.PutUvarint(id[t.hashLen:], uint64(collisions))
	return id[:t.hashLen+n]
}
109 changes: 109 additions & 0 deletions x/data/server/hasher/hasher_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package hasher

import (
"fmt"
"hash"
"math"
"testing"

"github.com/stretchr/testify/require"
"golang.org/x/crypto/blake2b"

"github.com/cosmos/cosmos-sdk/store/mem"
"github.com/tendermint/tendermint/libs/rand"
)

// TestHasher runs the testHasher round-trip check against the default hasher
// and against a deliberately weak configuration.
func TestHasher(t *testing.T) {
	// Default parameters.
	defaultHasher, err := NewHasher()
	require.NoError(t, err)
	testHasher(t, defaultHasher, 5)

	// A truncated digest combined with MinLength 1 produces many collisions,
	// which is meant to exercise the varint fallback.
	truncated := func() hash.Hash {
		inner, err := blake2b.New(8, nil)
		if err != nil {
			panic(err) // an error should not occur creating a hash
		}
		return sixteenBitHash{inner}
	}
	weakHasher, err := NewHasherWithOptions(HashOptions{
		MinLength: 1,
		NewHash:   truncated,
	})
	require.NoError(t, err)
	testHasher(t, weakHasher, 5)
}

// sixteenBitHash wraps a hash.Hash and shortens the digest returned by Sum so
// that collisions become frequent. Despite the name, Sum keeps three digest
// bytes (24 bits).
type sixteenBitHash struct {
	hash.Hash
}

// Sum appends the wrapped hash's digest to b and returns b followed by only
// the first three bytes of that digest.
func (h sixteenBitHash) Sum(b []byte) []byte {
	keep := len(b) + 3
	return h.Hash.Sum(b)[:keep]
}

// testHasher assigns identifiers to 10^k distinct random values, resolving
// collisions by retrying CreateID with an increasing collisions count, and
// then replays the same sequence against a fresh store to check that every
// value receives the same identifier again.
//
// h must be the package's hasher implementation; the type assertion panics
// otherwise.
func testHasher(t *testing.T, h Hasher, k int) {
	impl := h.(hasher)
	store := mem.NewStore()
	n := int(math.Pow10(k))
	data := make([][]byte, n)
	ids := make([][]byte, n)
	seen := make(map[string]bool, n)
	totalCollisions := 0
	secondaryCollisions := 0

	for i := 0; i < n; i++ {
		// Draw random values of random length until an unseen one turns up.
		var value []byte
		var valueStr string
		for {
			value = rand.Bytes(int(rand.Int31n(256)))
			valueStr = fmt.Sprintf("%x", value)
			if !seen[valueStr] {
				break
			}
		}
		data[i] = value
		seen[valueStr] = true

		// c ends up as the number of taken identifiers skipped for this value.
		c := 0
		for ; ; c++ {
			id := impl.CreateID(value, c)
			// Use Has rather than an empty Get result: a zero-length value
			// stored under id must still count as occupying it.
			if !store.Has(id) {
				ids[i] = id
				store.Set(id, value)
				break
			}
		}
		if c > 0 {
			totalCollisions++
		}
		if c > 1 {
			secondaryCollisions++
		}
	}

	t.Logf("total collisions: %d / %.0e, secondary collisions: %d, collision rate: %.4f%%", totalCollisions, float64(n), secondaryCollisions, float64(totalCollisions)/float64(n)*100.0)

	// Replay the same values in the same order against an empty store.
	store = mem.NewStore()

	for i := 0; i < n; i++ {
		for c := 0; ; c++ {
			newID := impl.CreateID(data[i], c)
			if !store.Has(newID) {
				store.Set(newID, data[i])
				require.Equal(t, ids[i], newID)
				break
			}
		}
	}
}
65 changes: 0 additions & 65 deletions x/data/server/keys.go

This file was deleted.

21 changes: 0 additions & 21 deletions x/data/server/lookup/doc.go

This file was deleted.

Loading

0 comments on commit 840ba08

Please sign in to comment.