Skip to content
This repository has been archived by the owner on Jun 27, 2023. It is now read-only.

Tests for unsharding PR #99

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions hamt/hamt.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,24 @@ import (
"fmt"
"os"

format "github.com/ipfs/go-unixfs"
"github.com/ipfs/go-unixfs/internal"

bitfield "github.com/ipfs/go-bitfield"
cid "github.com/ipfs/go-cid"
ipld "github.com/ipfs/go-ipld-format"
dag "github.com/ipfs/go-merkledag"
format "github.com/ipfs/go-unixfs"
)

const (
// HashMurmur3 is the multiformats identifier for Murmur3
HashMurmur3 uint64 = 0x22
)

// init installs murmur3 as the package-wide HAMT hash function via the
// internal.HAMTHashFunction hook. Other code (e.g. tests) may replace the
// hook afterwards, since it is a mutable package-level variable.
func init() {
	internal.HAMTHashFunction = murmur3Hash
}

// isValueNode reports whether this shard is a leaf entry carrying an
// actual key/value pair, as opposed to an internal trie node.
func (ds *Shard) isValueNode() bool {
	if ds.key == "" {
		return false
	}
	return ds.val != nil
}
Expand All @@ -45,17 +51,29 @@ func (ds *Shard) isValueNode() bool {
type Shard struct {
childer *childer

tableSize int
// Entries per node (number of possible childs indexed by the partial key).
schomatis marked this conversation as resolved.
Show resolved Hide resolved
tableSize int
// Bits needed to encode child indexes (log2 of number of entries). This is
// the number of bits taken from the hash key on each level of the tree.
tableSizeLg2 int

builder cid.Builder
hashFunc uint64

// String format with number of zeros that will be present in the hexadecimal
// encoding of the child index to always reach the fixed maxpadlen chars.
// Example: maxpadlen = 4 => prefixPadStr: "%04X" (print number in hexadecimal
// format padding with zeros to always reach 4 characters).
prefixPadStr string
maxpadlen int
// Length in chars of string that encodes child indexes. We encode indexes
// as hexadecimal strings to this is log4 of number of entries.
maxpadlen int

dserv ipld.DAGService

// FIXME: Remove. We don't actually store "value nodes". This confusing
// abstraction just removes the maxpadlen from the link names to extract
// the actual value link the trie is storing.
Comment on lines +74 to +76
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding of your comment here is that val.Name and maxpadlen can be used to compute key and so we shouldn't bother caching it here and just recompute it when needed. Is that correct?

Also, can you clarify if this a FIXME intended for this set of PRs?

// leaf node
key string
val *ipld.Link
Expand All @@ -68,12 +86,13 @@ func NewShard(dserv ipld.DAGService, size int) (*Shard, error) {
return nil, err
}

// FIXME: Make this at least a static configuration for testing.
ds.hashFunc = HashMurmur3
return ds, nil
}

func makeShard(ds ipld.DAGService, size int) (*Shard, error) {
lg2s, err := logtwo(size)
lg2s, err := Logtwo(size)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -211,7 +230,7 @@ func (ds *Shard) Set(ctx context.Context, name string, nd ipld.Node) error {
// name key in this Shard or its children. It also returns the previous link
// under that name key (if any).
func (ds *Shard) SetAndPrevious(ctx context.Context, name string, node ipld.Node) (*ipld.Link, error) {
hv := &hashBits{b: hash([]byte(name))}
hv := newHashBits(name)
err := ds.dserv.Add(ctx, node)
if err != nil {
return nil, err
Expand All @@ -221,6 +240,9 @@ func (ds *Shard) SetAndPrevious(ctx context.Context, name string, node ipld.Node
if err != nil {
return nil, err
}

// FIXME: We don't need to set the name here, it will get overwritten.
// This is confusing, confirm and remove this line.
lnk.Name = ds.linkNamePrefix(0) + name

return ds.setValue(ctx, hv, name, lnk)
Expand All @@ -236,13 +258,13 @@ func (ds *Shard) Remove(ctx context.Context, name string) error {
// RemoveAndPrevious is similar to the public Remove but also returns the
// old removed link (if it exists). Passing a nil value to setValue is what
// signals the deletion.
func (ds *Shard) RemoveAndPrevious(ctx context.Context, name string) (*ipld.Link, error) {
	return ds.setValue(ctx, newHashBits(name), name, nil)
}

// Find searches for a child node by 'name' within this hamt
func (ds *Shard) Find(ctx context.Context, name string) (*ipld.Link, error) {
hv := &hashBits{b: hash([]byte(name))}
hv := newHashBits(name)

var out *ipld.Link
err := ds.getValue(ctx, hv, name, func(sv *Shard) error {
Expand Down Expand Up @@ -489,10 +511,7 @@ func (ds *Shard) setValue(ctx context.Context, hv *hashBits, key string, value *
return nil, err
}
child.builder = ds.builder
chhv := &hashBits{
b: hash([]byte(grandChild.key)),
consumed: hv.consumed,
}
chhv := newConsumedHashBits(grandChild.key, hv.consumed)

// We explicitly ignore the oldValue returned by the next two insertions
// (which will be nil) to highlight there is no overwrite here: they are
Expand Down
18 changes: 15 additions & 3 deletions hamt/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ package hamt

import (
"fmt"
"math/bits"

"github.com/ipfs/go-unixfs/internal"

"github.com/spaolacci/murmur3"
"math/bits"
)

// hashBits is a helper that allows the reading of the 'next n bits' as an integer.
Expand All @@ -13,6 +15,16 @@ type hashBits struct {
consumed int
}

// newHashBits hashes val with the configured HAMT hash function and wraps
// the digest in a hashBits reader positioned at bit zero.
func newHashBits(val string) *hashBits {
	digest := internal.HAMTHashFunction([]byte(val))
	return &hashBits{b: digest}
}

// newConsumedHashBits is like newHashBits but marks the first `consumed`
// bits of the digest as already read, so traversal resumes at that depth.
// It delegates to newHashBits to keep the hashing logic in one place.
func newConsumedHashBits(val string, consumed int) *hashBits {
	hv := newHashBits(val)
	hv.consumed = consumed
	return hv
}

// mkmask returns a byte whose low n bits are set (n in [0, 8]).
// For n == 8 the shift wraps the byte to zero and the subtraction yields
// 0xFF, matching the full-byte mask.
func mkmask(n int) byte {
	var mask byte = 1
	mask <<= uint(n)
	return mask - 1
}
Expand Down Expand Up @@ -50,7 +62,7 @@ func (hb *hashBits) next(i int) int {
}
}

func logtwo(v int) (int, error) {
func Logtwo(v int) (int, error) {
if v <= 0 {
return 0, fmt.Errorf("hamt size should be a power of two")
}
Expand All @@ -61,7 +73,7 @@ func logtwo(v int) (int, error) {
return lg2, nil
}

func hash(val []byte) []byte {
func murmur3Hash(val []byte) []byte {
h := murmur3.New64()
h.Write(val)
return h.Sum(nil)
Expand Down
3 changes: 3 additions & 0 deletions internal/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package internal

// HAMTHashFunction is the hash function HAMT shards use to map keys to
// child indexes. It is a package-level hook so callers can swap the
// implementation (the hamt package installs murmur3 in its init; tests may
// substitute an identity hash). It must be assigned before any HAMT
// operation runs — calling a nil func panics.
var HAMTHashFunction func(val []byte) []byte
95 changes: 95 additions & 0 deletions io/completehamt_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package io

import (
"context"
"encoding/binary"
"fmt"
"math"
"testing"

mdtest "github.com/ipfs/go-merkledag/test"
"github.com/stretchr/testify/assert"

"github.com/ipfs/go-unixfs"
"github.com/ipfs/go-unixfs/hamt"

ipld "github.com/ipfs/go-ipld-format"
)

// CreateCompleteHAMT creates a HAMT with the following properties:
// * its height (distance/edges from root to deepest node) is specified by treeHeight.
// * all leaf Shard nodes have the same depth (and have only 'value' links).
// * all internal Shard nodes point only to other Shards (and hence have zero 'value' links).
// * the total number of 'value' links (directory entries) is:
//   childsPerNode ^ treeHeight.
// FIXME: HAMTHashFunction needs to be set to idHash by the caller. We depend on
//  this simplification for the current logic to work. (HAMTHashFunction is a
//  global setting of the package, it is hard-coded in the serialized Shard node
//  and not allowed to be changed on a per HAMT/Shard basis.)
//  (If we didn't rehash inside setValue then we could just generate
//  the fake hash as in io.SetAndPrevious through `newHashBits()` and pass
//  it as an argument making the hash independent of tree manipulation; that
//  sounds as the correct way to go in general and we wouldn't need this.)
func CreateCompleteHAMT(ds ipld.DAGService, treeHeight int, childsPerNode int) (ipld.Node, error) {
	if treeHeight < 1 {
		panic("treeHeight < 1")
	}
	if treeHeight > 8 {
		panic("treeHeight > 8: we don't allow a key larger than what can be encoded in a 64-bit word")
	}

	rootShard, err := hamt.NewShard(ds, childsPerNode)
	if err != nil {
		return nil, err
	}

	// Assuming we are using the ID hash function we can just insert all
	// the combinations of a byte slice that will reach the desired height.
	totalChildren := int(math.Pow(float64(childsPerNode), float64(treeHeight)))
	log2ofChilds, err := hamt.Logtwo(childsPerNode)
	if err != nil {
		return nil, err
	}
	// Each level of the tree consumes log2(childsPerNode) bits of the key,
	// and keys are built from whole bytes below, so the product must align
	// to a byte boundary.
	if log2ofChilds*treeHeight%8 != 0 {
		return nil, fmt.Errorf("log2(childsPerNode) * treeHeight should be multiple of 8")
	}
	bytesInKey := log2ofChilds * treeHeight / 8
	for i := 0; i < totalChildren; i++ {
		var hashbuf [8]byte
		binary.LittleEndian.PutUint64(hashbuf[:], uint64(i))
		var oldLink *ipld.Link
		oldLink, err = rootShard.SetAndPrevious(context.Background(), string(hashbuf[:bytesInKey]), unixfs.EmptyFileNode())
		if err != nil {
			return nil, err
		}
		if oldLink != nil {
			// We shouldn't be overwriting any value, otherwise the tree
			// won't be complete.
			return nil, fmt.Errorf("we have overwritten entry %s",
				oldLink.Cid)
		}
	}

	return rootShard.Node()
}

// idHash is an identity "hash": it returns its input unchanged. It is meant
// to replace internal.HAMTHashFunction in tests so key bytes map directly to
// child indexes (see the FIXME on CreateCompleteHAMT).
func idHash(val []byte) []byte {
	return val
}

// TestCreateCompleteShard builds a complete HAMT and checks that enumerating
// it yields exactly childsPerNode^treeHeight entries.
// NOTE(review): per the FIXME on CreateCompleteHAMT, internal.HAMTHashFunction
// must be set to idHash for that helper to work, but this test does not set it
// itself — confirm it is installed elsewhere (e.g. a TestMain) before relying
// on this test.
func TestCreateCompleteShard(t *testing.T) {
	ds := mdtest.Mock()
	childsPerNode := 16
	treeHeight := 2
	node, err := CreateCompleteHAMT(ds, treeHeight, childsPerNode)
	assert.NoError(t, err)

	shard, err := hamt.NewHamtFromDag(ds, node)
	assert.NoError(t, err)
	links, err := shard.EnumLinks(context.Background())
	assert.NoError(t, err)

	// Expected number of 'value' links in a complete tree of this shape.
	childNodes := int(math.Pow(float64(childsPerNode), float64(treeHeight)))
	assert.Equal(t, childNodes, len(links))
}
42 changes: 18 additions & 24 deletions io/directory.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@ package io
import (
"context"
"fmt"
mdag "github.com/ipfs/go-merkledag"
format "github.com/ipfs/go-unixfs"
"github.com/ipfs/go-unixfs/hamt"
"os"

"github.com/ipfs/go-unixfs/hamt"
"github.com/ipfs/go-unixfs/private/linksize"

"github.com/ipfs/go-cid"
ipld "github.com/ipfs/go-ipld-format"
logging "github.com/ipfs/go-log"
mdag "github.com/ipfs/go-merkledag"
format "github.com/ipfs/go-unixfs"
)

var log = logging.Logger("unixfs")
Expand All @@ -24,6 +26,7 @@ var log = logging.Logger("unixfs")
var HAMTShardingSize = 0

// DefaultShardWidth is the default value used for hamt sharding width.
// Needs to be a power of two (shard entry size) and multiple of 8 (bitfield size).
var DefaultShardWidth = 256

// Directory defines a UnixFS directory. It is used for creating, reading and
Expand Down Expand Up @@ -78,7 +81,9 @@ func productionLinkSize(linkName string, linkCid cid.Cid) int {
return len(linkName) + linkCid.ByteLen()
}

var estimatedLinkSize = productionLinkSize
// init registers productionLinkSize as the link-size estimator used by this
// package's directory-size accounting, via the linksize.LinkSizeFunction hook.
func init() {
	linksize.LinkSizeFunction = productionLinkSize
}

// BasicDirectory is the basic implementation of `Directory`. All the entries
// are stored in a single node.
Expand Down Expand Up @@ -167,11 +172,11 @@ func (d *BasicDirectory) computeEstimatedSize() {
}

func (d *BasicDirectory) addToEstimatedSize(name string, linkCid cid.Cid) {
d.estimatedSize += estimatedLinkSize(name, linkCid)
d.estimatedSize += linksize.LinkSizeFunction(name, linkCid)
}

func (d *BasicDirectory) removeFromEstimatedSize(name string, linkCid cid.Cid) {
d.estimatedSize -= estimatedLinkSize(name, linkCid)
d.estimatedSize -= linksize.LinkSizeFunction(name, linkCid)
if d.estimatedSize < 0 {
// Something has gone very wrong. Log an error and recompute the
// size from scratch.
Expand Down Expand Up @@ -208,10 +213,10 @@ func (d *BasicDirectory) needsToSwitchToHAMTDir(name string, nodeToAdd ipld.Node
if err != nil {
return false, err
}
operationSizeChange -= estimatedLinkSize(name, entryToRemove.Cid)
operationSizeChange -= linksize.LinkSizeFunction(name, entryToRemove.Cid)
}
if nodeToAdd != nil {
operationSizeChange += estimatedLinkSize(name, nodeToAdd.Cid())
operationSizeChange += linksize.LinkSizeFunction(name, nodeToAdd.Cid())
}

return d.estimatedSize+operationSizeChange >= HAMTShardingSize, nil
Expand Down Expand Up @@ -437,11 +442,11 @@ func (d *HAMTDirectory) switchToBasic(ctx context.Context) (*BasicDirectory, err
}

func (d *HAMTDirectory) addToSizeChange(name string, linkCid cid.Cid) {
d.sizeChange += estimatedLinkSize(name, linkCid)
d.sizeChange += linksize.LinkSizeFunction(name, linkCid)
}

func (d *HAMTDirectory) removeFromSizeChange(name string, linkCid cid.Cid) {
d.sizeChange -= estimatedLinkSize(name, linkCid)
d.sizeChange -= linksize.LinkSizeFunction(name, linkCid)
}

// Evaluate a switch from HAMTDirectory to BasicDirectory in case the size will
Expand All @@ -464,12 +469,12 @@ func (d *HAMTDirectory) needsToSwitchToBasicDir(ctx context.Context, name string
if err != nil {
return false, err
}
operationSizeChange -= estimatedLinkSize(name, entryToRemove.Cid)
operationSizeChange -= linksize.LinkSizeFunction(name, entryToRemove.Cid)
}

// For the AddEntry case compute the size addition of the new entry.
if nodeToAdd != nil {
operationSizeChange += estimatedLinkSize(name, nodeToAdd.Cid())
operationSizeChange += linksize.LinkSizeFunction(name, nodeToAdd.Cid())
}

if d.sizeChange+operationSizeChange >= 0 {
Expand Down Expand Up @@ -506,7 +511,7 @@ func (d *HAMTDirectory) sizeBelowThreshold(ctx context.Context, sizeChange int)
return false, linkResult.Err
}

partialSize += estimatedLinkSize(linkResult.Link.Name, linkResult.Link.Cid)
partialSize += linksize.LinkSizeFunction(linkResult.Link.Name, linkResult.Link.Cid)
if partialSize+sizeChange >= HAMTShardingSize {
// We have already fetched enough shards to assert we are
// above the threshold, so no need to keep fetching.
Expand Down Expand Up @@ -581,17 +586,6 @@ func (d *UpgradeableDirectory) AddChild(ctx context.Context, name string, nd ipl
return nil
}

func (d *UpgradeableDirectory) getDagService() ipld.DAGService {
switch v := d.Directory.(type) {
case *BasicDirectory:
return v.dserv
case *HAMTDirectory:
return v.dserv
default:
panic("unknown directory type")
}
}

// RemoveChild implements the `Directory` interface. Used in the case where we wrap
// a HAMTDirectory that might need to be downgraded to a BasicDirectory. The
// upgrade path is in AddChild.
Expand Down
Loading