refactor(x/data): update data module to use orm (part 2) (#970)

* refactor(x/data): update data module to use orm (part 2) * minor fixes * fix hasher tests * fix hasher tests * consolidate keys and utils * remove prefix * standalone method * fix import * use blake2b * update new hash Co-authored-by: Aaron Craelius <aaron@regen.network>
regen-network · Apr 5, 2022 · 840ba08 · 840ba08
1 parent 5af6387
commit 840ba08
Show file tree

Hide file tree

Showing 16 changed files with 327 additions and 636 deletions.
diff --git a/x/data/client/testsuite/grpc.go b/x/data/client/testsuite/grpc.go
@@ -23,7 +23,7 @@ func (s *IntegrationTestSuite) TestQueryByIRI() {
 			"invalid IRI",
 			fmt.Sprintf("%s/regen/data/v1/by-iri/%s", val.APIAddress, "foo"),
 			true,
-			"key not found",
+			"not found",
 		},
 		{
 			"valid request",
@@ -175,7 +175,7 @@ func (s *IntegrationTestSuite) TestQueryAttestors() {
 			"invalid attestor",
 			fmt.Sprintf("%s/regen/data/v1/attestors/%s", val.APIAddress, "foo"),
 			true,
-			"key not found",
+			"not found",
 			0,
 		},
 		{
@@ -289,7 +289,7 @@ func (s *IntegrationTestSuite) TestQueryResolvers() {
 			"invalid iri",
 			fmt.Sprintf("%s/regen/data/v1/resolvers/%s", val.APIAddress, "foo"),
 			true,
-			"key not found",
+			"not found",
 			0,
 		},
 		{

diff --git a/x/data/client/testsuite/query.go b/x/data/client/testsuite/query.go
@@ -36,7 +36,7 @@ func (s *IntegrationTestSuite) TestQueryByIRICmd() {
 			name:      "invalid iri",
 			args:      []string{"foo"},
 			expErr:    true,
-			expErrMsg: "key not found",
+			expErrMsg: "invalid IRI",
 		},
 		{
 			name:   "valid",
@@ -230,7 +230,7 @@ func (s *IntegrationTestSuite) TestQueryAttestorsCmd() {
 			name:      "invalid attestor",
 			args:      []string{"foo"},
 			expErr:    true,
-			expErrMsg: "key not found",
+			expErrMsg: "not found",
 		},
 		{
 			name:   "valid",
@@ -344,7 +344,7 @@ func (s *IntegrationTestSuite) TestQueryResolversCmd() {
 			name:      "invalid iri",
 			args:      []string{"abcd"},
 			expErr:    true,
-			expErrMsg: "can't find",
+			expErrMsg: "not found",
 		},
 		{
 			name:   "valid test",

diff --git a/x/data/go.mod b/x/data/go.mod
@@ -18,6 +18,7 @@ require (
 	github.com/tendermint/tendermint v0.34.15
 	google.golang.org/genproto v0.0.0-20220222213610-43724f9ea8cf
 	google.golang.org/grpc v1.44.0
+	google.golang.org/protobuf v1.27.1
 	gotest.tools/v3 v3.1.0
 )
 
@@ -119,7 +120,6 @@ require (
 	golang.org/x/sys v0.0.0-20211004093028-2c5d950f24ef // indirect
 	golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1 // indirect
 	golang.org/x/text v0.3.7 // indirect
-	google.golang.org/protobuf v1.27.1 // indirect
 	gopkg.in/ini.v1 v1.63.2 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect

diff --git a/x/data/server/hasher/doc.go b/x/data/server/hasher/doc.go
@@ -0,0 +1,18 @@
+/*
+Package hasher generates a unique binary identifier for a longer piece of binary data
+using a short, truncated hash of that data. The default hash is a 64-bit BLAKE2b digest;
+a different hash.Hash constructor can be supplied through HashOptions.
+
+A new Hasher instance can be created with the NewHasher() function. Advanced users can use
+the NewHasherWithOptions() function to tweak the underlying parameters, but the defaults
+were chosen based on testing and should provide a good balance of performance and storage
+efficiency.
+
+Shortened identifiers are generated using the idempotent Hasher.CreateID method.
+
+The default algorithm uses the first 4 bytes of a 64-bit BLAKE2b hash and then increases
+the length in the case of collisions. Identifiers will be 4 bytes long in the vast
+majority of cases and will sometimes be 5 and rarely 6 bytes long. In some extremely rare
+cases (which have not appeared in tests), identifiers may be longer.
+*/
+package hasher
diff --git a/x/data/server/hasher/hasher.go b/x/data/server/hasher/hasher.go
@@ -0,0 +1,102 @@
+package hasher
+
+import (
+	"encoding/binary"
+	"fmt"
+	"hash"
+
+	"golang.org/x/crypto/blake2b"
+)
+
+// Hasher generates a unique binary identifier for a longer piece of binary data
+// by truncating a hash of that data (see NewHasher for the default hash).
+type Hasher interface {
+	// CreateID is an idempotent method for creating a unique shortened identifier
+	// for the provided binary value; collisions selects which candidate to return.
+	CreateID(value []byte, collisions int) []byte
+}
+
+// NewHasher creates a new hasher instance with the default parameters: identifiers start
+// from the first 4 bytes of a 64-bit (8-byte) BLAKE2b digest. In the case of a collision, more
+// bytes of the hash will be used for disambiguation but this happens in a minority of cases
+// except for massively large data sets.
+func NewHasher() (Hasher, error) {
+	return NewHasherWithOptions(HashOptions{})
+}
+
+// NewHasherWithOptions creates a Hasher with custom options; zero-valued options fall back to
+// the defaults (MinLength 4, 8-byte BLAKE2b). Most users should just use NewHasher.
+// It returns an error when MinLength is negative or greater than the hash length.
+func NewHasherWithOptions(options HashOptions) (Hasher, error) {
+	minLength := options.MinLength
+	switch {
+	case minLength < 0:
+		// a negative length would otherwise only surface later as a panic inside CreateID
+		return nil, fmt.Errorf("option MinLength %d must not be negative", minLength)
+	case minLength == 0:
+		minLength = 4
+	}
+
+	newHash := options.NewHash
+	if newHash == nil {
+		newHash = func() hash.Hash {
+			h, err := blake2b.New(8, nil)
+			if err != nil {
+				panic(err) // an error should not occur creating a hash
+			}
+			return h
+		}
+	}
+
+	// probe one instance to learn the digest size produced by newHash
+	hashLen := len(newHash().Sum(nil))
+	if minLength > hashLen {
+		return nil, fmt.Errorf("option MinLength %d is greater than hash length %d", minLength, hashLen)
+	}
+
+	// bufLen leaves room for the full hash plus the largest possible uvarint suffix
+	return hasher{minLen: minLength, bufLen: hashLen + binary.MaxVarintLen64, newHash: newHash, hashLen: hashLen}, nil
+}
+
+// HashOptions is used to specify custom hash options and should only be used by advanced users.
+type HashOptions struct {
+	// NewHash is a function which returns a new hash.Hash instance (nil selects 8-byte BLAKE2b).
+	NewHash func() hash.Hash
+
+	// MinLength is the minimum number of hash bytes used to create a lookup identifier; 0 means 4.
+	MinLength int
+}
+
+type hasher struct {
+	minLen  int              // number of leading hash bytes every identifier starts with
+	bufLen  int              // hashLen + binary.MaxVarintLen64; capacity of the id buffer
+	newHash func() hash.Hash // constructor for the hash instance used by each CreateID call
+	hashLen int              // digest size in bytes, measured from newHash().Sum(nil)
+}
+
+// CreateID hashes value and returns the first minLen+collisions bytes of the digest. Once that
+// would exceed the digest, it returns the whole digest followed by a uvarint of collisions.
+func (t hasher) CreateID(value []byte, collisions int) (id []byte) {
+	h := t.newHash()
+	if _, err := h.Write(value); err != nil {
+		// we panic here because hash.Write returning an error shouldn't happen
+		panic(err)
+	}
+	hashBz := h.Sum(nil)
+
+	// Deal with collisions by extending the identifier with the next unused
+	// bytes of hashBz. The first minLen bytes are already part of every
+	// candidate, so re-appending one of them could never tell two values with
+	// the same prefix apart.
+	if end := t.minLen + collisions; end <= t.hashLen {
+		id = make([]byte, end, t.bufLen)
+		copy(id, hashBz[:end])
+		return id
+	}
+
+	// Hash exhausted: keep the whole digest and append a disambiguation varint.
+	id = make([]byte, t.bufLen)
+	copy(id, hashBz[:t.hashLen])
+	n := binary.PutUvarint(id[t.hashLen:], uint64(collisions))
+	return id[:t.hashLen+n]
+}
diff --git a/x/data/server/hasher/hasher_test.go b/x/data/server/hasher/hasher_test.go
@@ -0,0 +1,109 @@
+package hasher
+
+import (
+	"fmt"
+	"hash"
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"golang.org/x/crypto/blake2b"
+
+	"github.com/cosmos/cosmos-sdk/store/mem"
+	"github.com/tendermint/tendermint/libs/rand"
+)
+
+func TestHasher(t *testing.T) {
+	// test default case with good params
+	hasher, err := NewHasher()
+	require.NoError(t, err)
+	testHasher(t, hasher, 5) // k = 5: testHasher works on 10^5 values
+
+	// test suboptimal case (1-byte minimum, 3-byte hash) to trigger varint fallback
+	hasher, err = NewHasherWithOptions(HashOptions{
+		MinLength: 1,
+		NewHash: func() hash.Hash {
+			hash, err := blake2b.New(8, nil)
+			if err != nil {
+				panic(err) // an error should not occur creating a hash
+			}
+			return sixteenBitHash{
+				hash,
+			}
+		},
+	})
+	require.NoError(t, err)
+	testHasher(t, hasher, 5)
+}
+
+type sixteenBitHash struct {
+	hash.Hash // embedded so every method except Sum is delegated unchanged
+}
+
+func (h sixteenBitHash) Sum(b []byte) []byte {
+	bz := h.Hash.Sum(b)
+	// keep b plus only the first three digest bytes (24 bits, despite the type name)
+	return bz[:len(b)+3]
+}
+
+func testHasher(t *testing.T, h Hasher, k int) { // assigns ids to 10^k random values, then replays them to check the ids are reproducible
+	hasher := h.(hasher)
+	store := mem.NewStore()
+	n := int(math.Pow10(k)) // number of distinct values to generate
+	data := make([][]byte, n)
+	values := map[string]bool{}
+	ids := map[int][]byte{} // id assigned to value i during the first pass
+	totalCollisions := 0
+	secondaryCollisions := 0
+
+	for i := 0; i < n; i++ {
+		var value []byte
+		var valueStr string
+		for { // draw random values until one that has not been seen before
+			m := rand.Int31n(256)
+			value = rand.Bytes(int(m))
+			valueStr = fmt.Sprintf("%x", value)
+			if !values[valueStr] {
+				break
+			}
+		}
+		data[i] = value
+		values[valueStr] = true
+
+		c := 0 // counts the occupied ids encountered before a free one is found
+		for ; ; c++ {
+			id := hasher.CreateID(value, c)
+			v := store.Get(id)
+			if len(v) == 0 { // a nil or empty result is treated as a free id
+				ids[i] = id
+				store.Set(id, value)
+				break
+			}
+		}
+		if c > 1 { // NOTE(review): c >= 1 already means one occupied id was hit; confirm the intended thresholds
+			totalCollisions += 1
+		}
+		if c > 2 {
+			secondaryCollisions += 1
+		}
+	}
+
+	t.Logf("total collisions: %d / %.0e, secondary collisions: %d, collision rate: %.4f%%", totalCollisions, float64(n), secondaryCollisions, float64(totalCollisions)/float64(n)*100.0)
+
+	store = mem.NewStore() // second pass: replay the same values in the same order into a fresh store
+
+	for i := 0; i < n; i++ {
+		id := ids[i]
+		value := data[i]
+
+		for c := 0; ; c++ {
+			newId := hasher.CreateID(value, c)
+			v := store.Get(newId)
+			if len(v) == 0 {
+				store.Set(newId, value)
+				require.Equal(t, id, newId) // the replay must reproduce the id assigned in the first pass
+				break
+			}
+		}
+	}
+}
diff --git a/x/data/server/keys.go b/x/data/server/keys.go
diff --git a/x/data/server/lookup/doc.go b/x/data/server/lookup/doc.go