-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor(x/data): update data module to use orm (part 2) (#970)
* refactor(x/data): update data module to use orm (part 2) * minor fixes * fix hasher tests * fix hasher tests * consolidate keys and utils * remove prefix * standalone method * fix import * use blake2b * update new hash Co-authored-by: Aaron Craelius <aaron@regen.network>
- Loading branch information
1 parent
5af6387
commit 840ba08
Showing
16 changed files
with
327 additions
and
636 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
/* | ||
Package hasher generates a unique binary identifier for a longer piece of binary data | ||
using an efficient hash function (by default BLAKE2b with a 64-bit digest). | ||
A new Hasher instance can be created with the NewHasher() function. Advanced users can use | ||
the NewHasherWithOptions() function to tweak the underlying parameters, but the defaults | ||
were chosen based on testing and should provide a good balance of performance and storage | ||
efficiency. | ||
Shortened identifiers are generated using the idempotent Hasher.CreateID method. | ||
The default algorithm uses the first 4 bytes of a 64-bit BLAKE2b hash and then | ||
increases the length in the case of collisions. Identifiers will be 4 bytes long in the vast | ||
majority of cases and will sometimes be 5 and rarely 6 bytes long. In some extremely rare | ||
cases (which have not appeared in tests), identifiers may be longer. | ||
*/ | ||
|
||
package hasher |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
package hasher | ||
|
||
import ( | ||
"encoding/binary" | ||
"fmt" | ||
"hash" | ||
|
||
"golang.org/x/crypto/blake2b" | ||
) | ||
|
||
// Hasher derives a short binary identifier from an arbitrarily long piece of
// binary data by hashing it.
type Hasher interface {
	// CreateID returns the shortened identifier for value. The collisions
	// argument is the number of identifiers already found to be taken while
	// looking for a free one for this value; it is used to derive a different
	// identifier on each retry. The method is idempotent: the same
	// (value, collisions) pair always yields the same identifier.
	CreateID(value []byte, collisions int) []byte
}
|
||
// NewHasher creates a new hasher instance. Default parameters are currently set to use the first | ||
// 4-bytes of the 64-bit BLAKE2b, non-cryptographic hash. In the case of a collision, more bytes | ||
// of the hash will be used for disambiguation but this happens in a minority of cases except | ||
// for massively large data sets. | ||
func NewHasher() (Hasher, error) { | ||
return NewHasherWithOptions(HashOptions{}) | ||
} | ||
|
||
// NewHasherWithOptions creates a Hash with custom options. Most users should just use NewHasher | ||
// with the default values. | ||
func NewHasherWithOptions(options HashOptions) (Hasher, error) { | ||
minLength := options.MinLength | ||
if minLength == 0 { | ||
minLength = 4 | ||
} | ||
|
||
newHash := options.NewHash | ||
if newHash == nil { | ||
newHash = func() hash.Hash { | ||
hash, err := blake2b.New(8, nil) | ||
if err != nil { | ||
panic(err) // an error should not occur creating a hash | ||
} | ||
return hash | ||
} | ||
} | ||
|
||
hashLen := len(newHash().Sum(nil)) | ||
if minLength > hashLen { | ||
return nil, fmt.Errorf("option MinLength %d is greater than hash length %d", minLength, hashLen) | ||
} | ||
|
||
bufLen := hashLen + binary.MaxVarintLen64 | ||
|
||
return hasher{ | ||
minLen: minLength, | ||
bufLen: bufLen, | ||
newHash: newHash, | ||
hashLen: hashLen, | ||
}, nil | ||
} | ||
|
||
// HashOptions customizes the Hasher built by NewHasherWithOptions. It is meant
// for advanced users only; zero-valued fields select the defaults.
type HashOptions struct {
	// NewHash returns a new hash.Hash instance. When nil, a BLAKE2b hash
	// with an 8-byte digest is used.
	NewHash func() hash.Hash

	// MinLength is the minimum number of hash bytes used to create a lookup
	// identifier. Zero selects the default of 4. It must not be greater than
	// the digest length of the hash returned by NewHash.
	MinLength int
}
|
||
// hasher is the Hasher implementation returned by NewHasherWithOptions.
type hasher struct {
	// minLen is the number of digest bytes used when there are no collisions.
	minLen int
	// bufLen is the capacity of the longest possible identifier: the full
	// digest followed by a maximum-length uvarint.
	bufLen int
	// newHash returns a fresh hash.Hash; it is called once per CreateID call.
	newHash func() hash.Hash
	// hashLen is the digest length in bytes of the hash returned by newHash.
	hashLen int
}

// CreateID returns the identifier for value given the number of collisions
// already encountered for it.
//
// As long as the digest has enough bytes, the identifier is the first
// minLen+collisions bytes of the digest, so every additional collision extends
// the identifier by one byte. Once the digest is exhausted, the identifier is
// the full digest followed by collisions encoded as a uvarint. Such collisions
// are almost impossible with good settings, but can happen with a suboptimal
// hash function. Identifiers for different collision counts are always
// distinct, and the result is deterministic for a given (value, collisions).
//
// CreateID panics if collisions is negative or if writing to the hash fails.
func (t hasher) CreateID(value []byte, collisions int) (id []byte) {
	if collisions < 0 {
		panic("hasher: collisions must not be negative")
	}

	h := t.newHash()
	if _, err := h.Write(value); err != nil {
		// we panic here because hash.Write returning an error shouldn't happen
		panic(err)
	}
	digest := h.Sum(nil)

	// Compare against the remaining digest bytes rather than computing
	// minLen+collisions first, so a huge collisions value cannot overflow.
	if collisions <= t.hashLen-t.minLen {
		n := t.minLen + collisions
		id = make([]byte, n, t.bufLen)
		copy(id, digest)
		return id
	}

	// The digest alone cannot disambiguate any further: emit the whole digest
	// plus a varint that encodes the collision count.
	id = make([]byte, t.bufLen)
	copy(id, digest[:t.hashLen])
	n := binary.PutUvarint(id[t.hashLen:], uint64(collisions))
	return id[:t.hashLen+n]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
package hasher | ||
|
||
import ( | ||
"fmt" | ||
"hash" | ||
"math" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
"golang.org/x/crypto/blake2b" | ||
|
||
"github.com/cosmos/cosmos-sdk/store/mem" | ||
"github.com/tendermint/tendermint/libs/rand" | ||
) | ||
|
||
func TestHasher(t *testing.T) { | ||
// test default case with good params | ||
hasher, err := NewHasher() | ||
require.NoError(t, err) | ||
testHasher(t, hasher, 5) | ||
|
||
// test suboptimal case to trigger varint fallback | ||
hasher, err = NewHasherWithOptions(HashOptions{ | ||
MinLength: 1, | ||
NewHash: func() hash.Hash { | ||
hash, err := blake2b.New(8, nil) | ||
if err != nil { | ||
panic(err) // an error should not occur creating a hash | ||
} | ||
return sixteenBitHash{ | ||
hash, | ||
} | ||
}, | ||
}) | ||
require.NoError(t, err) | ||
testHasher(t, hasher, 5) | ||
} | ||
|
||
// sixteenBitHash wraps a hash.Hash and truncates the digest produced by Sum.
// It is used as a suboptimal hash to trigger the varint fallback in tests.
// NOTE(review): despite the name, Sum keeps three bytes (24 bits), not 16 bits.
type sixteenBitHash struct {
	hash.Hash
}

// Sum appends the wrapped hash's digest to b and returns b followed by only
// the first three digest bytes.
func (h sixteenBitHash) Sum(b []byte) []byte {
	const keep = 3
	full := h.Hash.Sum(b)
	return full[:len(b)+keep]
}
|
||
func testHasher(t *testing.T, h Hasher, k int) { | ||
hasher := h.(hasher) | ||
store := mem.NewStore() | ||
n := int(math.Pow10(k)) | ||
data := make([][]byte, n) | ||
values := map[string]bool{} | ||
ids := map[int][]byte{} | ||
totalCollisions := 0 | ||
secondaryCollisions := 0 | ||
|
||
for i := 0; i < n; i++ { | ||
var value []byte | ||
var valueStr string | ||
for { | ||
m := rand.Int31n(256) | ||
value = rand.Bytes(int(m)) | ||
valueStr = fmt.Sprintf("%x", value) | ||
if !values[valueStr] { | ||
break | ||
} | ||
} | ||
data[i] = value | ||
values[valueStr] = true | ||
|
||
c := 0 | ||
for ; ; c++ { | ||
id := hasher.CreateID(value, c) | ||
v := store.Get(id) | ||
if len(v) == 0 { | ||
ids[i] = id | ||
store.Set(id, value) | ||
break | ||
} | ||
} | ||
if c > 1 { | ||
totalCollisions += 1 | ||
} | ||
if c > 2 { | ||
secondaryCollisions += 1 | ||
} | ||
} | ||
|
||
t.Logf("total collisions: %d / %.0e, secondary collisions: %d, collision rate: %.4f%%", totalCollisions, float64(n), secondaryCollisions, float64(totalCollisions)/float64(n)*100.0) | ||
|
||
store = mem.NewStore() | ||
|
||
for i := 0; i < n; i++ { | ||
id := ids[i] | ||
value := data[i] | ||
|
||
for c := 0; ; c++ { | ||
newId := hasher.CreateID(value, c) | ||
v := store.Get(newId) | ||
if len(v) == 0 { | ||
store.Set(newId, value) | ||
require.Equal(t, id, newId) | ||
break | ||
} | ||
} | ||
} | ||
} |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.