Skip to content

Commit

Permalink
Significantly reduce profiling overhead
Browse files Browse the repository at this point in the history
Avoid calling runtime.GoroutineProfile() twice for every profiling
sample by reusing a dynamically growing slice.

On my machine this reduces the average time the world is stopped from 50
µsec to 25 µsec per sample. But whether or not this translates into 2x
overhead reduction in the realword will require further testing.

Additionally this patch uses a 100x more efficient approach for counting
the stacks. The old approach was taking 8 µsec per aggregation, the new
one takes about 90 ns.
  • Loading branch information
felixge committed Sep 19, 2020
1 parent 4fc6897 commit 5aa20cb
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 40 deletions.
113 changes: 79 additions & 34 deletions fgprof.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@ func Start(w io.Writer, format Format) func() error {
ticker := time.NewTicker(time.Second / hz)
stopCh := make(chan struct{})

prof := &profiler{}
stackCounts := stackCounter{}

go func() {
defer ticker.Stop()

for {
select {
case <-ticker.C:
stackCounts.Update()
stacks := prof.GoroutineProfile()
stackCounts.Update(stacks)
case <-stopCh:
return
}
Expand All @@ -36,62 +39,104 @@ func Start(w io.Writer, format Format) func() error {

return func() error {
stopCh <- struct{}{}
return writeFormat(w, stackCounts, format, hz)
return writeFormat(w, stackCounts.HumanMap(prof.SelfFrame()), format, hz)
}
}

type stackCounter map[string]int
// profiler provides a convenient and performant way to access
// runtime.GoroutineProfile().
type profiler struct {
stacks []runtime.StackRecord
selfFrame *runtime.Frame
}

func (s stackCounter) Update() {
// Determine the runtime.Frame of this func so we can hide it from our
// profiling output.
rpc := make([]uintptr, 1)
n := runtime.Callers(1, rpc)
if n < 1 {
panic("could not determine selfFrame")
// GoroutineProfile returns the stacks of all goroutines currently managed by
// the scheduler. This includes both goroutines that are currently running
// (On-CPU), as well as waiting (Off-CPU).
func (p *profiler) GoroutineProfile() []runtime.StackRecord {
if p.selfFrame == nil {
// Determine the runtime.Frame of this func so we can hide it from our
// profiling output.
rpc := make([]uintptr, 1)
n := runtime.Callers(1, rpc)
if n < 1 {
panic("could not determine selfFrame")
}
selfFrame, _ := runtime.CallersFrames(rpc).Next()
p.selfFrame = &selfFrame
}
selfFrame, _ := runtime.CallersFrames(rpc).Next()

// COPYRIGHT: The code for populating `p` below is copied from
// writeRuntimeProfile in src/runtime/pprof/pprof.go.
// We don't know how many goroutines exist, so we have to grow p.stacks
// dynamically. We overshoot by 10% since it's possible that more goroutines
// are launched in between two calls to GoroutineProfile. Once p.stacks
// reaches the maximum numnber of goroutines used by the program, it will get
// reused indefinitely, eliminating GoroutineProfile calls and allocations.
//
// Find out how many records there are (GoroutineProfile(nil)),
// allocate that many records, and get the data.
// There's a race—more records might be added between
// the two calls—so allocate a few extra records for safety
// and also try again if we're very unlucky.
// The loop should only execute one iteration in the common case.
var p []runtime.StackRecord
n, ok := runtime.GoroutineProfile(nil)
// TODO(fg) There might be workloads where it would be nice to shrink
// p.stacks dynamically as well, but let's not over-engineer this until we
// understand those cases better.
for {
// Allocate room for a slightly bigger profile,
// in case a few more entries have been added
// since the call to ThreadProfile.
p = make([]runtime.StackRecord, n+10)
n, ok = runtime.GoroutineProfile(p)
if ok {
p = p[0:n]
break
n, ok := runtime.GoroutineProfile(p.stacks)
if !ok {
p.stacks = make([]runtime.StackRecord, int(float64(n)*1.1))
} else {
return p.stacks[0:n]
}
// Profile grew; try again.
}
}

outer:
func (p *profiler) SelfFrame() *runtime.Frame {
return p.selfFrame
}

type stringStackCounter map[string]int

func (s stringStackCounter) Update(p []runtime.StackRecord) {
for _, pp := range p {
frames := runtime.CallersFrames(pp.Stack())

var stack []string
for {
frame, more := frames.Next()
stack = append([]string{frame.Function}, stack...)
if !more {
break
} else if frame.Entry == selfFrame.Entry {
continue outer
}
}
key := strings.Join(stack, ";")
s[key]++
}
}

type stackCounter map[[32]uintptr]int

func (s stackCounter) Update(p []runtime.StackRecord) {
for _, pp := range p {
s[pp.Stack0]++
}
}

// @TODO(fg) create a better interface that avoids the pprof output having to
// split the stacks using the `;` separator.
func (s stackCounter) HumanMap(exclude *runtime.Frame) map[string]int {
m := map[string]int{}
outer:
for stack0, count := range s {
frames := runtime.CallersFrames((&runtime.StackRecord{Stack0: stack0}).Stack())

var stack []string
for {
frame, more := frames.Next()
if frame.Entry == exclude.Entry {
continue outer
}
stack = append([]string{frame.Function}, stack...)
if !more {
break
}
}
key := strings.Join(stack, ";")
s[key]++
m[key] = count
}
return m
}
17 changes: 17 additions & 0 deletions fgprof_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,20 @@ func TestStart(t *testing.T) {
t.Fatalf("invalid output:\n%s", out)
}
}

func BenchmarkProfiler(b *testing.B) {
prof := &profiler{}
for i := 0; i < b.N; i++ {
prof.GoroutineProfile()
}
}

func BenchmarkStackCounter(b *testing.B) {
prof := &profiler{}
stacks := prof.GoroutineProfile()
sc := stackCounter{}
b.ResetTimer()
for i := 0; i < b.N; i++ {
sc.Update(stacks)
}
}
8 changes: 4 additions & 4 deletions format.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const (
FormatPprof Format = "pprof"
)

func writeFormat(w io.Writer, s stackCounter, f Format, hz int) error {
func writeFormat(w io.Writer, s map[string]int, f Format, hz int) error {
switch f {
case FormatFolded:
return writeFolded(w, s)
Expand All @@ -31,7 +31,7 @@ func writeFormat(w io.Writer, s stackCounter, f Format, hz int) error {
}
}

func writeFolded(w io.Writer, s stackCounter) error {
func writeFolded(w io.Writer, s map[string]int) error {
for _, stack := range sortedKeys(s) {
count := s[stack]
if _, err := fmt.Fprintf(w, "%s %d\n", stack, count); err != nil {
Expand All @@ -41,7 +41,7 @@ func writeFolded(w io.Writer, s stackCounter) error {
return nil
}

func toPprof(s stackCounter, hz int) *profile.Profile {
func toPprof(s map[string]int, hz int) *profile.Profile {
functionID := uint64(1)
locationID := uint64(1)
line := int64(1)
Expand Down Expand Up @@ -92,7 +92,7 @@ func toPprof(s stackCounter, hz int) *profile.Profile {
return p
}

func sortedKeys(s stackCounter) []string {
func sortedKeys(s map[string]int) []string {
var keys []string
for stack := range s {
keys = append(keys, stack)
Expand Down
2 changes: 1 addition & 1 deletion pprof.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"github.com/google/pprof/profile"
)

func toProfile(s stackCounter, hz int) *profile.Profile {
func toProfile(s map[string]int, hz int) *profile.Profile {
functionID := uint64(1)
locationID := uint64(1)

Expand Down
2 changes: 1 addition & 1 deletion pprof_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
)

func Test_toProfile(t *testing.T) {
s := stackCounter{
s := map[string]int{
"foo;bar": 2,
"foo": 1,
}
Expand Down

0 comments on commit 5aa20cb

Please sign in to comment.