Skip to content

Commit

Permalink
BREAKING CHANGE: change Pointer CBOR representation to a kinded union (
Browse files Browse the repository at this point in the history
…#60)

From:

	type Pointer union {
		&Node "0"
		Bucket "1"
	} representation keyed

i.e. {"0": CID} or {"1": [KV...]}

To:

	type Pointer union {
		&Node link
		Bucket list
	} representation kinded

i.e. CID or [KV...]

Also removes redundant refmt tags

Closes: https://github.com/ipfs/go-hamt-ipld/issues/53
  • Loading branch information
rvagg authored Dec 3, 2020
1 parent 569969d commit 5cdbb51
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 146 deletions.
30 changes: 13 additions & 17 deletions hamt.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,15 @@ var ErrMalformedHamt = fmt.Errorf("HAMT node was malformed")
// array. Indexes `[1]` and `[2]` are not present, but index `[3]` is at
// the second position of our Pointers array.
//
// (Note: the `refmt` tags are ignored by cbor-gen which will generate an
// array type rather than map.)
//
// The IPLD Schema representation of this data structure is as follows:
//
// type Node struct {
// bitfield Bytes
// pointers [Pointer]
// } representation tuple
type Node struct {
Bitfield *big.Int `refmt:"bf"`
Pointers []*Pointer `refmt:"p"`
Bitfield *big.Int
Pointers []*Pointer

bitWidth int
hash func([]byte) []byte
Expand All @@ -82,10 +79,11 @@ type Node struct {

// Pointer is an element in a HAMT node's Pointers array, encoded as an IPLD
// tuple in DAG-CBOR of shape:
// {"0": CID} or {"1": [KV...]}
// Where a map with a single key of "0" contains a Link, where a map with a
// single key of "1" contains a KV bucket. The map may contain only one of
// these two possible keys.
// CID or [KV...]
// i.e. it is represented as a "kinded union" where a Link is a pointer to a
// child node, while an array is a bucket of elements local to this node. A
// Pointer must represent exactly one of of these two states and cannot be both
// (or neither).
//
// There are between 1 and 2^bitWidth of these Pointers in any HAMT node.
//
Expand All @@ -94,20 +92,17 @@ type Node struct {
// the bucket is replaced with a link to a newly created HAMT node which will
// contain the `bucketSize+1` elements in its own Pointers array.
//
// (Note: the `refmt` tags are ignored by cbor-gen which will generate an
// array type rather than map.)
//
// The IPLD Schema representation of this data structure is as follows:
//
// type Pointer union {
// &Node "0"
// Bucket "1"
// } representation keyed
// &Node link
// Bucket list
// } representation kinded
//
// type Bucket [KV]
type Pointer struct {
KVs []*KV `refmt:"v,omitempty"`
Link cid.Cid `refmt:"l,omitempty"`
KVs []*KV
Link cid.Cid

// cache is a pointer to an in-memory Node, which may or may not be
// present, and corresponds to the Link field, which also may or may not
Expand Down Expand Up @@ -389,6 +384,7 @@ func loadNode(
isLink := ch.isShard()
isBucket := ch.KVs != nil
if !((isLink && !isBucket) || (!isLink && isBucket)) {
// Pointer#UnmarshalCBOR shouldn't allow this
return nil, ErrMalformedHamt
}
if isLink && ch.Link.Type() != cid.DagCBOR { // not dag-cbor
Expand Down
85 changes: 19 additions & 66 deletions hamt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -980,10 +980,9 @@ func TestMalformedHamt(t *testing.T) {
bcat(b(0x40+1), b(kv.key)), // bytes(1) "\x??"
bcat(b(0x40+1), b(kv.value)))) // bytes(1) "\x??"
}
return bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x31)), // string(1) "1"
bcat(b(0x80+byte(len(kvs))), // array(?)
en)) // bucket contents
return bcat(
b(0x80+byte(len(kvs))), // array(?)
en) // bucket contents
}

// most minimal HAMT node with one k/v entry, sanity check we can load this
Expand Down Expand Up @@ -1040,65 +1039,25 @@ func TestMalformedHamt(t *testing.T) {
t.Fatal("Should have returned ErrMalformedHamt for mismatch bitfield count")
}

// test mixed link & bucket

// this node contains 2 elements, the first is a plain entry with one bucket
// and with a single key of 0x0100, the second element is a link to a child
// node which happens to be the same CID as this node will be stored in.
// However, this second entry has both a CID and a bucket in the same
// element, which is not allowed. Without checks for exactly one of these
// two things then then a lookup for key 0x0100 would navigate through this
// node and back again as its own child to the first element.
store(
bcat(b(0x80+2), // array(2)
bcat(b(0x40+1), b(0x03)), // bytes(1) "\x03" (bitmap)
bcat(b(0x80+2), // array(2)
bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x31)), // string(1) "1"
bcat(b(0x80+1), // array(1)
bcat(b(0x80+2), // array(2)
bcat(b(0x40+2), []byte{0x01, 0x00}), // bytes(2) "\x0100"
bcat(b(0x40+1), b(0xff))))), // bytes(1) "\xff"
bcat(b(0xa0+2), // map(2)
bcat(b(0x60+1), b(0x30)), // string(1) "0"
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
cidBytes), // cid
bcat(b(0x60+1), b(0x31)), // string(1) "1"
bcat(b(0x80+1), // array(1)
bcat(b(0x80+2), // array(2)
bcat(b(0x40+1), b(0x01)), // bytes(1) "\x00"
bcat(b(0x40+1), b(0xfe)))))))) // bytes(1) "\xff

n, err = LoadNode(ctx, cs, bcid, UseTreeBitWidth(8), UseHashFunction(identityHash))
if err == nil || n != nil || err.Error() != "Pointers should be a single element map" {
// no ErrMalformedHamt here possible bcause of cbor-gen wrapping
t.Fatal("Should have returned error for bad Pointer cbor")
}

// test pointers with links have are DAG-CBOR multicodec
// sanity check minimal node pointing to a child node
store(
bcat(b(0x80+2), // array(2)
bcat(b(0x40+1), b(0x01)), // bytes(1) "\x01" (bitmap)
bcat(b(0x80+1), // array(1)
bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x30)), // string(1) "0"
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
cidBytes))))) // cid
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
cidBytes)))) // cid
load()

// node pointing to a non-dag-cbor node
store(
bcat(b(0x80+2), // array(2)
bcat(b(0x40+1), b(0x01)), // bytes(1) "\x01" (bitmap)
bcat(b(0x80+1), // array(1)
bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x30)), // string(1) "0"
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
badCidBytes))))) // cid
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
badCidBytes)))) // cid
n, err = LoadNode(ctx, cs, bcid, UseTreeBitWidth(8), UseHashFunction(identityHash))
if err != ErrMalformedHamt || n != nil {
t.Fatal("Should have returned ErrMalformedHamt for bad child link codec")
Expand Down Expand Up @@ -1178,11 +1137,9 @@ func TestMalformedHamt(t *testing.T) {
bcat(b(0x80+2), // array(2)
bcat(b(0x40+1), b(0x01)), // bytes(1) "\x01" (bitmap)
bcat(b(0x80+1), // array(1)
bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x30)), // string(1) "0"
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
ccidBytes))))) // cid
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
ccidBytes)))) // cid

vg, err := load().FindRaw(ctx, string([]byte{0x00, 0x01}))
// without validation of the child block, this would return an ErrNotFound
Expand Down Expand Up @@ -1221,17 +1178,13 @@ func TestMalformedHamt(t *testing.T) {
bcat(b(0x80+2), // array(2)
bcat(b(0x40+1), b(0x03)), // bytes(1) "\x03" (bitmap)
bcat(b(0x80+2), // array(2)
bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x31)), // string(1) "1"
bcat(b(0x80+1), // array(1)
bcat(b(0x80+2), // array(2)
bcat(b(0x40+2), []byte{0x00, 0x01}), // bytes(2) "\x0001"
bcat(b(0x40+1), b(0xff))))), // bytes(1) "\xff"
bcat(b(0xa0+1), // map(1)
bcat(b(0x60+1), b(0x30)), // string(1) "0"
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
ccidBytes))))) // cid
bcat(b(0x80+1), // array(1)
bcat(b(0x80+2), // array(2)
bcat(b(0x40+2), []byte{0x00, 0x01}), // bytes(2) "\x0001"
bcat(b(0x40+1), b(0xff)))), // bytes(1) "\xff"
bcat(b(0xd8), b(0x2a), // tag(42)
b(0x58), b(0x27), // bytes(39)
ccidBytes)))) // cid

vg, err = load().FindRaw(ctx, string([]byte{0x00, 0x01}))
// without validation of the child block, this would return an ErrNotFound
Expand Down
96 changes: 33 additions & 63 deletions pointer_cbor.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import (
cbg "github.com/whyrusleeping/cbor-gen"
)

var keyZero = []byte("0")
var keyOne = []byte("1")
// implemented as a kinded union - a "Pointer" is either a Link (child node) or
// an Array (bucket)

func (t *Pointer) MarshalCBOR(w io.Writer) error {
if t.Link != cid.Undef && len(t.KVs) > 0 {
Expand All @@ -18,36 +18,11 @@ func (t *Pointer) MarshalCBOR(w io.Writer) error {

scratch := make([]byte, 9)

if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajMap, 1); err != nil {
return err
}

if t.Link != cid.Undef {
// key for links is "0"
// Refmt (and the general IPLD data model currently) can't deal
// with non string keys. So we have this weird restriction right now
// hoping to be able to use integer keys soon
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, 1); err != nil {
return err
}

if _, err := w.Write(keyZero); err != nil {
return err
}

if err := cbg.WriteCidBuf(scratch, w, t.Link); err != nil {
return err
}
} else {
// key for KVs is "1"
if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, 1); err != nil {
return err
}

if _, err := w.Write(keyOne); err != nil {
return err
}

if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajArray, uint64(len(t.KVs))); err != nil {
return err
}
Expand All @@ -69,51 +44,29 @@ func (t *Pointer) UnmarshalCBOR(br io.Reader) error {
if err != nil {
return err
}
if maj != cbg.MajMap {
return fmt.Errorf("cbor input should be of map")
}

if extra != 1 {
return fmt.Errorf("Pointers should be a single element map")
}

maj, val, err := cbg.CborReadHeaderBuf(br, scratch)
if err != nil {
return err
}

if maj != cbg.MajTextString {
return fmt.Errorf("expected text string key")
}

if val != 1 {
return fmt.Errorf("map keys in pointers must be a single byte long")
}

if _, err := io.ReadAtLeast(br, scratch[:1], 1); err != nil {
return err
}
if maj == cbg.MajTag {
if extra != 42 {
return fmt.Errorf("expected tag 42 for child node link")
}

switch scratch[0] {
case '0':
c, err := cbg.ReadCid(br)
ba, err := cbg.ReadByteArray(br, 512)
if err != nil {
return err
}
t.Link = c
return nil
case '1':
maj, length, err := cbg.CborReadHeaderBuf(br, scratch)

c, err := bufToCid(ba)
if err != nil {
return err
}

if maj != cbg.MajArray {
return fmt.Errorf("expected an array of KVs in cbor input")
}
t.Link = c
return nil
} else if maj == cbg.MajArray {
length := extra

if length > 32 {
return fmt.Errorf("KV array in cbor input for pointer was too long")
return fmt.Errorf("KV array in CBOR input for pointer was too long")
}

t.KVs = make([]*KV, length)
Expand All @@ -127,7 +80,24 @@ func (t *Pointer) UnmarshalCBOR(br io.Reader) error {
}

return nil
default:
return fmt.Errorf("invalid pointer map key in cbor input: %d", val)
} else {
return fmt.Errorf("expected CBOR child node link or array")
}
}

// from https://github.com/whyrusleeping/cbor-gen/blob/211df3b9e24c6e0d0c338b440e6ab4ab298505b2/utils.go#L530
func bufToCid(buf []byte) (cid.Cid, error) {
if len(buf) == 0 {
return cid.Undef, fmt.Errorf("undefined CID")
}

if len(buf) < 2 {
return cid.Undef, fmt.Errorf("DAG-CBOR serialized CIDs must have at least two bytes")
}

if buf[0] != 0 {
return cid.Undef, fmt.Errorf("DAG-CBOR serialized CIDs must have binary multibase")
}

return cid.Cast(buf[1:])
}

0 comments on commit 5cdbb51

Please sign in to comment.