btf: replace postorderIterator with recursion
Throw out the painstakingly hand-optimized postorder iterator in favour of
a simple recursive function. It turns out this can be a lot faster for larger
types, probably because the visited map can be allocated on the stack.

There is a small hit for very simple types, but it doesn't seem to affect
the overall benchmarks much.
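
The stack-allocation effect can be illustrated with a toy example (the
types and names below are illustrative, not part of this commit). A map
that never escapes the call tree that creates it is eligible for stack
allocation; building with go build -gcflags=-m reports such maps as
"make(...) does not escape":

    // node is a stand-in for btf.Type in this sketch.
    type node struct{ children []*node }

    // visit mirrors the shape of the new visitInPostorder: it reads and
    // writes the visited map but never stores it anywhere, so the map
    // does not escape to the heap.
    func visit(n *node, visited map[*node]struct{}) {
        if _, ok := visited[n]; ok {
            return
        }
        visited[n] = struct{}{}
        for _, c := range n.children {
            visit(c, visited)
        }
    }

    func countReachable(root *node) int {
        visited := make(map[*node]struct{}) // candidate for stack allocation
        visit(root, visited)
        return len(visited)
    }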

    core: 1
    goos: linux
    goarch: amd64
    pkg: github.com/cilium/ebpf/btf
    cpu: 12th Gen Intel(R) Core(TM) i7-1260P
                                        │ base-po.txt │           recursive.txt            │
                                        │   sec/op    │   sec/op     vs base               │
    PostorderTraversal/single_type           32.22n ± 0%   39.69n ± 0%  +23.18% (p=0.002 n=6)
    PostorderTraversal/cycle(1)              576.3n ± 3%   112.3n ± 0%  -80.51% (p=0.002 n=6)
    PostorderTraversal/cycle(10)             2.807µ ± 1%   1.578µ ± 1%  -43.80% (p=0.002 n=6)
    PostorderTraversal/gov_update_cpu_data   2.039m ± 1%   1.795m ± 1%  -11.97% (p=0.002 n=6)
    geomean                                  3.211µ        1.885µ       -41.30%

                                        │  base-po.txt   │              recursive.txt               │
                                        │      B/op      │     B/op      vs base                    │
    PostorderTraversal/single_type             0.000 ± 0%       0.000 ± 0%         ~ (p=1.000 n=6) ¹
    PostorderTraversal/cycle(1)                264.0 ± 0%         0.0 ± 0%  -100.00% (p=0.002 n=6)
    PostorderTraversal/cycle(10)               716.5 ± 0%       326.0 ± 0%   -54.50% (p=0.002 n=6)
    PostorderTraversal/gov_update_cpu_data   345.1Ki ± 0%     334.8Ki ± 0%    -2.98% (p=0.002 n=6)
    geomean                                               ²                 ?                      ² ³
    ¹ all samples are equal
    ² summaries must be >0 to compute geomean
    ³ ratios must be >0 to compute geomean

                                        │ base-po.txt  │             recursive.txt              │
                                        │  allocs/op   │ allocs/op   vs base                    │
    PostorderTraversal/single_type           0.000 ± 0%     0.000 ± 0%         ~ (p=1.000 n=6) ¹
    PostorderTraversal/cycle(1)              4.000 ± 0%     0.000 ± 0%  -100.00% (p=0.002 n=6)
    PostorderTraversal/cycle(10)             7.000 ± 0%     1.000 ± 0%   -85.71% (p=0.002 n=6)
    PostorderTraversal/gov_update_cpu_data   132.0 ± 1%     115.0 ± 1%   -12.88% (p=0.002 n=6)
    geomean                                             ²               ?                      ² ³
    ¹ all samples are equal
    ² summaries must be >0 to compute geomean
    ³ ratios must be >0 to compute geomean
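
The tables above are benchstat output. A typical way to reproduce such a
comparison, assuming the benchmark name from the first column and the
file names from the column headers:

    go test -run '^$' -bench PostorderTraversal -count 6 ./btf > base-po.txt
    # apply this change, then
    go test -run '^$' -bench PostorderTraversal -count 6 ./btf > recursive.txt
    benchstat base-po.txt recursive.txt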

Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
lmb committed Mar 27, 2024
1 parent a99904d commit f631fcc
Showing 6 changed files with 129 additions and 157 deletions.
62 changes: 28 additions & 34 deletions btf/marshal.go
@@ -5,6 +5,7 @@ import (
 	"encoding/binary"
 	"errors"
 	"fmt"
+	"maps"
 	"math"
 	"slices"
 	"sync"
@@ -38,6 +39,7 @@ type encoder struct {
 	buf     *bytes.Buffer
 	strings *stringTableBuilder
 	ids     map[Type]TypeID
+	visited map[Type]struct{}
 	lastID  TypeID
 }

@@ -163,15 +165,15 @@ func (b *Builder) Marshal(buf []byte, opts *MarshalOptions) ([]byte, error) {
 		buf:     w,
 		strings: stb,
 		lastID:  TypeID(len(b.types)),
-		ids:     make(map[Type]TypeID, len(b.types)),
+		visited: make(map[Type]struct{}, len(b.types)),
+		ids:     maps.Clone(b.stableIDs),
 	}
 
 	// Ensure that types are marshaled in the exact order they were Add()ed.
 	// Otherwise the ID returned from Add() won't match.
 	e.pending.Grow(len(b.types))
 	for _, typ := range b.types {
 		e.pending.Push(typ)
-		e.ids[typ] = b.stableIDs[typ]
 	}
 
 	if err := e.deflatePending(); err != nil {
@@ -218,16 +220,28 @@ func (b *Builder) addString(str string) (uint32, error) {
 	return b.strings.Add(str)
 }
 
-func (e *encoder) allocateID(typ Type) error {
-	id := e.lastID + 1
-	if id < e.lastID {
-		return errors.New("type ID overflow")
-	}
+func (e *encoder) allocateIDs(root Type) (err error) {
+	visitInPostorder(root, e.visited, func(typ Type) bool {
+		if _, ok := typ.(*Void); ok {
+			return true
+		}
 
-	e.pending.Push(typ)
-	e.ids[typ] = id
-	e.lastID = id
-	return nil
+		if _, ok := e.ids[typ]; ok {
+			return true
+		}
+
+		id := e.lastID + 1
+		if id < e.lastID {
+			err = errors.New("type ID overflow")
+			return false
+		}
+
+		e.pending.Push(typ)
+		e.ids[typ] = id
+		e.lastID = id
+		return true
+	})
+	return
 }
 
 // id returns the ID for the given type or panics with an error.
@@ -247,33 +261,13 @@ func (e *encoder) id(typ Type) TypeID {
 func (e *encoder) deflatePending() error {
 	// Declare root outside of the loop to avoid repeated heap allocations.
 	var root Type
-	skip := func(t Type) (skip bool) {
-		if t == root {
-			// Force descending into the current root type even if it already
-			// has an ID. Otherwise we miss children of types that have their
-			// ID pre-allocated via Add.
-			return false
-		}
-
-		_, isVoid := t.(*Void)
-		_, alreadyEncoded := e.ids[t]
-		return isVoid || alreadyEncoded
-	}
 
 	for !e.pending.Empty() {
 		root = e.pending.Shift()
 
 		// Allocate IDs for all children of typ, including transitive dependencies.
-		iter := postorderTraversal(root, skip)
-		for iter.Next() {
-			if iter.Type == root {
-				// The iterator yields root at the end, do not allocate another ID.
-				break
-			}
-
-			if err := e.allocateID(iter.Type); err != nil {
-				return err
-			}
+		if err := e.allocateIDs(root); err != nil {
+			return err
 		}
 
 		if err := e.deflateType(root); err != nil {
@@ -498,7 +492,7 @@ func (e *encoder) deflateEnum64(raw *rawType, enum *Enum) (err error) {
 	if enum.Signed {
 		placeholder.Encoding = Signed
 	}
-	if err := e.allocateID(placeholder); err != nil {
+	if err := e.allocateIDs(placeholder); err != nil {
 		return fmt.Errorf("add enum64 placeholder: %w", err)
 	}

11 changes: 3 additions & 8 deletions btf/marshal_test.go
@@ -79,16 +79,11 @@ func TestRoundtripVMlinux(t *testing.T) {
 		types[i+1], types[j+1] = types[j+1], types[i+1]
 	})
 
-	seen := make(map[Type]bool)
+	visited := make(map[Type]struct{})
 limitTypes:
 	for i, typ := range types {
-		iter := postorderTraversal(typ, func(t Type) (skip bool) {
-			return seen[t]
-		})
-		for iter.Next() {
-			seen[iter.Type] = true
-		}
-		if len(seen) >= math.MaxInt16 {
+		visitInPostorder(typ, visited, func(t Type) bool { return true })
+		if len(visited) >= math.MaxInt16 {
 			// IDs exceeding math.MaxUint16 can trigger a bug when loading BTF.
 			// This can be removed once the patch lands.
 			// See https://lore.kernel.org/bpf/20220909092107.3035-1-oss@lmb.io/
162 changes: 71 additions & 91 deletions btf/traversal.go
@@ -2,142 +2,122 @@ package btf
 
 import (
 	"fmt"
-
-	"github.com/cilium/ebpf/internal"
 )
 
 // Functions to traverse a cyclic graph of types. The below was very useful:
 // https://eli.thegreenplace.net/2015/directed-graph-traversal-orderings-and-applications-to-data-flow-analysis/#post-order-and-reverse-post-order
 
-type postorderIterator struct {
-	// Iteration skips types for which this function returns true.
-	skip func(Type) bool
-	// The root type. May be nil if skip(root) is true.
-	root Type
-
-	// Contains types which need to be either walked or yielded.
-	types typeDeque
-	// Contains a boolean whether the type has been walked or not.
-	walked internal.Deque[bool]
-	// The set of types which has been pushed onto types.
-	pushed map[Type]struct{}
-
-	// The current type. Only valid after a call to Next().
-	Type Type
-}
-
-// postorderTraversal iterates all types reachable from root by visiting the
-// leaves of the graph first.
+// Visit all types reachable from root in postorder.
 //
-// Types for which skip returns true are ignored. skip may be nil.
-func postorderTraversal(root Type, skip func(Type) (skip bool)) postorderIterator {
-	// Avoid allocations for the common case of a skipped root.
-	if skip != nil && skip(root) {
-		return postorderIterator{}
-	}
-
-	po := postorderIterator{root: root, skip: skip}
-	children(root, po.push)
-
-	return po
-}
-
-func (po *postorderIterator) push(t *Type) {
-	if _, ok := po.pushed[*t]; ok || *t == po.root {
-		return
+// Traversal stops if yield returns false.
+//
+// Returns false if traversal was aborted.
+func visitInPostorder(root Type, visited map[Type]struct{}, yield func(typ Type) bool) bool {
+	if _, ok := visited[root]; ok {
+		return true
 	}
-
-	if po.skip != nil && po.skip(*t) {
-		return
+	if visited == nil {
+		visited = make(map[Type]struct{})
 	}
+	visited[root] = struct{}{}
 
-	if po.pushed == nil {
-		// Lazily allocate pushed to avoid an allocation for Types without children.
-		po.pushed = make(map[Type]struct{})
+	cont := children(root, func(child *Type) bool {
+		return visitInPostorder(*child, visited, yield)
+	})
+	if !cont {
+		return false
 	}
 
-	po.pushed[*t] = struct{}{}
-	po.types.Push(t)
-	po.walked.Push(false)
+	return yield(root)
 }
 
-// Next returns true if there is another Type to traverse.
-func (po *postorderIterator) Next() bool {
-	for !po.types.Empty() {
-		t := po.types.Pop()
-
-		if !po.walked.Pop() {
-			// Push the type again, so that we re-evaluate it in done state
-			// after all children have been handled.
-			po.types.Push(t)
-			po.walked.Push(true)
-
-			// Add all direct children to todo.
-			children(*t, po.push)
-		} else {
-			// We've walked this type previously, so we now know that all
-			// children have been handled.
-			po.Type = *t
-			return true
-		}
-	}
-
-	// Only return root once.
-	po.Type, po.root = po.root, nil
-	return po.Type != nil
-}
-
-// children calls fn on each child of typ.
+// children calls yield on each child of typ.
 //
-// Iteration stops early if fn returns false.
-func children(typ Type, fn func(child *Type)) {
+// Traversal stops if yield returns false.
+//
+// Returns false if traversal was aborted.
+func children(typ Type, yield func(child *Type) bool) bool {
 	// Explicitly type switch on the most common types to allow the inliner to
 	// do its work. This avoids allocating intermediate slices from walk() on
 	// the heap.
 	switch v := typ.(type) {
 	case *Void, *Int, *Enum, *Fwd, *Float:
 		// No children to traverse.
 	case *Pointer:
-		fn(&v.Target)
+		if !yield(&v.Target) {
+			return false
+		}
 	case *Array:
-		fn(&v.Index)
-		fn(&v.Type)
+		if !yield(&v.Index) {
+			return false
+		}
+		if !yield(&v.Type) {
+			return false
+		}
 	case *Struct:
 		for i := range v.Members {
-			fn(&v.Members[i].Type)
+			if !yield(&v.Members[i].Type) {
+				return false
+			}
 		}
 	case *Union:
 		for i := range v.Members {
-			fn(&v.Members[i].Type)
+			if !yield(&v.Members[i].Type) {
+				return false
+			}
 		}
 	case *Typedef:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *Volatile:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *Const:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *Restrict:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *Func:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *FuncProto:
-		fn(&v.Return)
+		if !yield(&v.Return) {
+			return false
+		}
 		for i := range v.Params {
-			fn(&v.Params[i].Type)
+			if !yield(&v.Params[i].Type) {
+				return false
+			}
 		}
 	case *Var:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *Datasec:
		for i := range v.Vars {
-			fn(&v.Vars[i].Type)
+			if !yield(&v.Vars[i].Type) {
+				return false
+			}
 		}
 	case *declTag:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *typeTag:
-		fn(&v.Type)
+		if !yield(&v.Type) {
+			return false
+		}
 	case *cycle:
 		// cycle has children, but we ignore them deliberately.
 	default:
 		panic(fmt.Sprintf("don't know how to walk Type %T", v))
 	}
+
+	return true
 }
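
For orientation, a minimal sketch of how the new entry point is used from
inside package btf (postorderNames is a hypothetical helper, not part of
this commit). Passing a nil visited map lets visitInPostorder allocate
one lazily:

    // postorderNames records the dynamic type of every Type reachable
    // from root, dependencies before dependents.
    func postorderNames(root Type) []string {
        var names []string
        visitInPostorder(root, nil, func(typ Type) bool {
            names = append(names, fmt.Sprintf("%T", typ))
            return true // keep traversing
        })
        return names
    }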