Skip to content
This repository has been archived by the owner on Jun 27, 2023. It is now read-only.

Tests for unsharding PR #99

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions hamt/hamt.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,24 @@ import (
"fmt"
"os"

format "github.com/ipfs/go-unixfs"
"github.com/ipfs/go-unixfs/internal"

bitfield "github.com/ipfs/go-bitfield"
cid "github.com/ipfs/go-cid"
ipld "github.com/ipfs/go-ipld-format"
dag "github.com/ipfs/go-merkledag"
format "github.com/ipfs/go-unixfs"
)

const (
// HashMurmur3 is the multiformats identifier for Murmur3
HashMurmur3 uint64 = 0x22
)

// init installs murmur3 as the package-wide HAMT hash function via the
// internal.HAMTHashFunction hook. Other code (e.g. tests) may replace the
// hook afterwards, since it is a mutable package-level variable.
func init() {
	internal.HAMTHashFunction = murmur3Hash
}

// isValueNode reports whether this shard is a leaf entry carrying an
// actual key/value pair, as opposed to an internal trie node.
func (ds *Shard) isValueNode() bool {
	if ds.key == "" {
		return false
	}
	return ds.val != nil
}
Expand All @@ -45,17 +51,29 @@ func (ds *Shard) isValueNode() bool {
type Shard struct {
childer *childer

tableSize int
// Entries per node (number of possible childs indexed by the partial key).
schomatis marked this conversation as resolved.
Show resolved Hide resolved
tableSize int
// Bits needed to encode child indexes (log2 of number of entries). This is
// the number of bits taken from the hash key on each level of the tree.
tableSizeLg2 int

builder cid.Builder
hashFunc uint64

// String format with number of zeros that will be present in the hexadecimal
// encoding of the child index to always reach the fixed maxpadlen chars.
// Example: maxpadlen = 4 => prefixPadStr: "%04X" (print number in hexadecimal
// format padding with zeros to always reach 4 characters).
prefixPadStr string
maxpadlen int
// Length in chars of string that encodes child indexes. We encode indexes
// as hexadecimal strings to this is log4 of number of entries.
maxpadlen int

dserv ipld.DAGService

// FIXME: Remove. We don't actually store "value nodes". This confusing
// abstraction just removes the maxpadlen from the link names to extract
// the actual value link the trie is storing.
Comment on lines +74 to +76
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding of your comment here is that val.Name and maxpadlen can be used to compute key and so we shouldn't bother caching it here and just recompute it when needed. Is that correct?

Also, can you clarify if this a FIXME intended for this set of PRs?

// leaf node
key string
val *ipld.Link
Expand All @@ -68,12 +86,13 @@ func NewShard(dserv ipld.DAGService, size int) (*Shard, error) {
return nil, err
}

// FIXME: Make this at least a static configuration for testing.
ds.hashFunc = HashMurmur3
return ds, nil
}

func makeShard(ds ipld.DAGService, size int) (*Shard, error) {
lg2s, err := logtwo(size)
lg2s, err := Logtwo(size)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -211,7 +230,7 @@ func (ds *Shard) Set(ctx context.Context, name string, nd ipld.Node) error {
// name key in this Shard or its children. It also returns the previous link
// under that name key (if any).
func (ds *Shard) SetAndPrevious(ctx context.Context, name string, node ipld.Node) (*ipld.Link, error) {
hv := &hashBits{b: hash([]byte(name))}
hv := newHashBits(name)
err := ds.dserv.Add(ctx, node)
if err != nil {
return nil, err
Expand All @@ -221,6 +240,9 @@ func (ds *Shard) SetAndPrevious(ctx context.Context, name string, node ipld.Node
if err != nil {
return nil, err
}

// FIXME: We don't need to set the name here, it will get overwritten.
// This is confusing, confirm and remove this line.
lnk.Name = ds.linkNamePrefix(0) + name

return ds.setValue(ctx, hv, name, lnk)
Expand All @@ -236,13 +258,13 @@ func (ds *Shard) Remove(ctx context.Context, name string) error {
// RemoveAndPrevious is similar to the public Remove but also returns the
// old removed link (if it exists). Passing a nil value to setValue is what
// signals the deletion.
func (ds *Shard) RemoveAndPrevious(ctx context.Context, name string) (*ipld.Link, error) {
	return ds.setValue(ctx, newHashBits(name), name, nil)
}

// Find searches for a child node by 'name' within this hamt
func (ds *Shard) Find(ctx context.Context, name string) (*ipld.Link, error) {
hv := &hashBits{b: hash([]byte(name))}
hv := newHashBits(name)

var out *ipld.Link
err := ds.getValue(ctx, hv, name, func(sv *Shard) error {
Expand Down Expand Up @@ -489,10 +511,7 @@ func (ds *Shard) setValue(ctx context.Context, hv *hashBits, key string, value *
return nil, err
}
child.builder = ds.builder
chhv := &hashBits{
b: hash([]byte(grandChild.key)),
consumed: hv.consumed,
}
chhv := newConsumedHashBits(grandChild.key, hv.consumed)

// We explicitly ignore the oldValue returned by the next two insertions
// (which will be nil) to highlight there is no overwrite here: they are
Expand Down
18 changes: 15 additions & 3 deletions hamt/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ package hamt

import (
"fmt"
"math/bits"

"github.com/ipfs/go-unixfs/internal"

"github.com/spaolacci/murmur3"
"math/bits"
)

// hashBits is a helper that allows the reading of the 'next n bits' as an integer.
Expand All @@ -13,6 +15,16 @@ type hashBits struct {
consumed int
}

// newHashBits hashes val with the configured HAMT hash function and wraps
// the digest in a hashBits reader positioned at bit zero.
func newHashBits(val string) *hashBits {
	digest := internal.HAMTHashFunction([]byte(val))
	return &hashBits{b: digest}
}

// newConsumedHashBits is like newHashBits but marks the first `consumed`
// bits of the digest as already read, so traversal resumes at that depth.
// It delegates to newHashBits to keep the hashing logic in one place.
func newConsumedHashBits(val string, consumed int) *hashBits {
	hv := newHashBits(val)
	hv.consumed = consumed
	return hv
}

// mkmask returns a byte whose low n bits are set (n in [0, 8]).
// For n == 8 the shift wraps the byte to zero and the subtraction yields
// 0xFF, matching the full-byte mask.
func mkmask(n int) byte {
	var mask byte = 1
	mask <<= uint(n)
	return mask - 1
}
Expand Down Expand Up @@ -50,7 +62,7 @@ func (hb *hashBits) next(i int) int {
}
}

func logtwo(v int) (int, error) {
func Logtwo(v int) (int, error) {
if v <= 0 {
return 0, fmt.Errorf("hamt size should be a power of two")
}
Expand All @@ -61,7 +73,7 @@ func logtwo(v int) (int, error) {
return lg2, nil
}

func hash(val []byte) []byte {
func murmur3Hash(val []byte) []byte {
h := murmur3.New64()
h.Write(val)
return h.Sum(nil)
Expand Down
3 changes: 3 additions & 0 deletions internal/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package internal

// HAMTHashFunction is the hash function HAMT shards use to map keys to
// child indexes. It is a package-level hook so callers can swap the
// implementation (the hamt package installs murmur3 in its init; tests may
// substitute an identity hash). It must be assigned before any HAMT
// operation runs — calling a nil func panics.
var HAMTHashFunction func(val []byte) []byte
95 changes: 95 additions & 0 deletions io/completehamt_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package io

import (
"context"
"encoding/binary"
"fmt"
"math"
"testing"

mdtest "github.com/ipfs/go-merkledag/test"
"github.com/stretchr/testify/assert"

"github.com/ipfs/go-unixfs"
"github.com/ipfs/go-unixfs/hamt"

ipld "github.com/ipfs/go-ipld-format"
)

// CreateCompleteHAMT creates a HAMT with the following properties:
// * its height (distance/edges from root to deepest node) is specified by treeHeight.
// * all leaf Shard nodes have the same depth (and have only 'value' links).
// * all internal Shard nodes point only to other Shards (and hence have zero 'value' links).
// * the total number of 'value' links (directory entries) is:
//   childsPerNode ^ treeHeight.
// FIXME: HAMTHashFunction needs to be set to idHash by the caller. We depend on
//  this simplification for the current logic to work. (HAMTHashFunction is a
//  global setting of the package, it is hard-coded in the serialized Shard node
//  and not allowed to be changed on a per HAMT/Shard basis.)
//  (If we didn't rehash inside setValue then we could just generate
//  the fake hash as in io.SetAndPrevious through `newHashBits()` and pass
//  it as an argument making the hash independent of tree manipulation; that
//  sounds as the correct way to go in general and we wouldn't need this.)
func CreateCompleteHAMT(ds ipld.DAGService, treeHeight int, childsPerNode int) (ipld.Node, error) {
	if treeHeight < 1 {
		panic("treeHeight < 1")
	}
	if treeHeight > 8 {
		panic("treeHeight > 8: we don't allow a key larger than what can be encoded in a 64-bit word")
	}

	rootShard, err := hamt.NewShard(ds, childsPerNode)
	if err != nil {
		return nil, err
	}

	// Assuming we are using the ID hash function we can just insert all
	// the combinations of a byte slice that will reach the desired height.
	totalChildren := int(math.Pow(float64(childsPerNode), float64(treeHeight)))
	log2ofChilds, err := hamt.Logtwo(childsPerNode)
	if err != nil {
		return nil, err
	}
	// Each level of the tree consumes log2(childsPerNode) bits of the key,
	// and keys are built from whole bytes below, so the product must align
	// to a byte boundary.
	if log2ofChilds*treeHeight%8 != 0 {
		return nil, fmt.Errorf("log2(childsPerNode) * treeHeight should be multiple of 8")
	}
	bytesInKey := log2ofChilds * treeHeight / 8
	for i := 0; i < totalChildren; i++ {
		var hashbuf [8]byte
		binary.LittleEndian.PutUint64(hashbuf[:], uint64(i))
		var oldLink *ipld.Link
		oldLink, err = rootShard.SetAndPrevious(context.Background(), string(hashbuf[:bytesInKey]), unixfs.EmptyFileNode())
		if err != nil {
			return nil, err
		}
		if oldLink != nil {
			// We shouldn't be overwriting any value, otherwise the tree
			// won't be complete.
			return nil, fmt.Errorf("we have overwritten entry %s",
				oldLink.Cid)
		}
	}

	return rootShard.Node()
}

// idHash is an identity "hash": it returns its input unchanged. It is meant
// to replace internal.HAMTHashFunction in tests so key bytes map directly to
// child indexes (see the FIXME on CreateCompleteHAMT).
func idHash(val []byte) []byte {
	return val
}

// TestCreateCompleteShard builds a complete HAMT and checks that enumerating
// it yields exactly childsPerNode^treeHeight entries.
// NOTE(review): per the FIXME on CreateCompleteHAMT, internal.HAMTHashFunction
// must be set to idHash for that helper to work, but this test does not set it
// itself — confirm it is installed elsewhere (e.g. a TestMain) before relying
// on this test.
func TestCreateCompleteShard(t *testing.T) {
	ds := mdtest.Mock()
	childsPerNode := 16
	treeHeight := 2
	node, err := CreateCompleteHAMT(ds, treeHeight, childsPerNode)
	assert.NoError(t, err)

	shard, err := hamt.NewHamtFromDag(ds, node)
	assert.NoError(t, err)
	links, err := shard.EnumLinks(context.Background())
	assert.NoError(t, err)

	// Expected number of 'value' links in a complete tree of this shape.
	childNodes := int(math.Pow(float64(childsPerNode), float64(treeHeight)))
	assert.Equal(t, childNodes, len(links))
}
42 changes: 18 additions & 24 deletions io/directory.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@ package io
import (
"context"
"fmt"
mdag "github.com/ipfs/go-merkledag"
format "github.com/ipfs/go-unixfs"
"github.com/ipfs/go-unixfs/hamt"
"os"

"github.com/ipfs/go-unixfs/hamt"
"github.com/ipfs/go-unixfs/private/linksize"

"github.com/ipfs/go-cid"
ipld "github.com/ipfs/go-ipld-format"
logging "github.com/ipfs/go-log"
mdag "github.com/ipfs/go-merkledag"
format "github.com/ipfs/go-unixfs"
)

var log = logging.Logger("unixfs")
Expand All @@ -24,6 +26,7 @@ var log = logging.Logger("unixfs")
var HAMTShardingSize = 0

// DefaultShardWidth is the default value used for hamt sharding width.
// Needs to be a power of two (shard entry size) and multiple of 8 (bitfield size).
var DefaultShardWidth = 256

// Directory defines a UnixFS directory. It is used for creating, reading and
Expand Down Expand Up @@ -78,7 +81,9 @@ func productionLinkSize(linkName string, linkCid cid.Cid) int {
return len(linkName) + linkCid.ByteLen()
}

var estimatedLinkSize = productionLinkSize
// init registers productionLinkSize as the link-size estimator used by this
// package's directory-size accounting, via the linksize.LinkSizeFunction hook.
func init() {
	linksize.LinkSizeFunction = productionLinkSize
}

// BasicDirectory is the basic implementation of `Directory`. All the entries
// are stored in a single node.
Expand Down Expand Up @@ -167,11 +172,11 @@ func (d *BasicDirectory) computeEstimatedSize() {
}

func (d *BasicDirectory) addToEstimatedSize(name string, linkCid cid.Cid) {
d.estimatedSize += estimatedLinkSize(name, linkCid)
d.estimatedSize += linksize.LinkSizeFunction(name, linkCid)
}

func (d *BasicDirectory) removeFromEstimatedSize(name string, linkCid cid.Cid) {
d.estimatedSize -= estimatedLinkSize(name, linkCid)
d.estimatedSize -= linksize.LinkSizeFunction(name, linkCid)
if d.estimatedSize < 0 {
// Something has gone very wrong. Log an error and recompute the
// size from scratch.
Expand Down Expand Up @@ -208,10 +213,10 @@ func (d *BasicDirectory) needsToSwitchToHAMTDir(name string, nodeToAdd ipld.Node
if err != nil {
return false, err
}
operationSizeChange -= estimatedLinkSize(name, entryToRemove.Cid)
operationSizeChange -= linksize.LinkSizeFunction(name, entryToRemove.Cid)
}
if nodeToAdd != nil {
operationSizeChange += estimatedLinkSize(name, nodeToAdd.Cid())
operationSizeChange += linksize.LinkSizeFunction(name, nodeToAdd.Cid())
}

return d.estimatedSize+operationSizeChange >= HAMTShardingSize, nil
Expand Down Expand Up @@ -437,11 +442,11 @@ func (d *HAMTDirectory) switchToBasic(ctx context.Context) (*BasicDirectory, err
}

func (d *HAMTDirectory) addToSizeChange(name string, linkCid cid.Cid) {
d.sizeChange += estimatedLinkSize(name, linkCid)
d.sizeChange += linksize.LinkSizeFunction(name, linkCid)
}

func (d *HAMTDirectory) removeFromSizeChange(name string, linkCid cid.Cid) {
d.sizeChange -= estimatedLinkSize(name, linkCid)
d.sizeChange -= linksize.LinkSizeFunction(name, linkCid)
}

// Evaluate a switch from HAMTDirectory to BasicDirectory in case the size will
Expand All @@ -464,12 +469,12 @@ func (d *HAMTDirectory) needsToSwitchToBasicDir(ctx context.Context, name string
if err != nil {
return false, err
}
operationSizeChange -= estimatedLinkSize(name, entryToRemove.Cid)
operationSizeChange -= linksize.LinkSizeFunction(name, entryToRemove.Cid)
}

// For the AddEntry case compute the size addition of the new entry.
if nodeToAdd != nil {
operationSizeChange += estimatedLinkSize(name, nodeToAdd.Cid())
operationSizeChange += linksize.LinkSizeFunction(name, nodeToAdd.Cid())
}

if d.sizeChange+operationSizeChange >= 0 {
Expand Down Expand Up @@ -506,7 +511,7 @@ func (d *HAMTDirectory) sizeBelowThreshold(ctx context.Context, sizeChange int)
return false, linkResult.Err
}

partialSize += estimatedLinkSize(linkResult.Link.Name, linkResult.Link.Cid)
partialSize += linksize.LinkSizeFunction(linkResult.Link.Name, linkResult.Link.Cid)
if partialSize+sizeChange >= HAMTShardingSize {
// We have already fetched enough shards to assert we are
// above the threshold, so no need to keep fetching.
Expand Down Expand Up @@ -581,17 +586,6 @@ func (d *UpgradeableDirectory) AddChild(ctx context.Context, name string, nd ipl
return nil
}

func (d *UpgradeableDirectory) getDagService() ipld.DAGService {
switch v := d.Directory.(type) {
case *BasicDirectory:
return v.dserv
case *HAMTDirectory:
return v.dserv
default:
panic("unknown directory type")
}
}

// RemoveChild implements the `Directory` interface. Used in the case where we wrap
// a HAMTDirectory that might need to be downgraded to a BasicDirectory. The
// upgrade path is in AddChild.
Expand Down
Loading