Skip to content

Commit

Permalink
GPU Fix call stack issue and support ggml (#1546)
Browse files Browse the repository at this point in the history
  • Loading branch information
grcevski authored Jan 20, 2025
1 parent 05851de commit 7dd2e9d
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 23 deletions.
52 changes: 29 additions & 23 deletions pkg/internal/ebpf/gpuevent/gpuevent.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ type modInfo struct {
ino uint64
}

type offsetsMap map[int64]string
type moduleOffsets map[uint64]offsetsMap
type moduleOffsets map[uint64]*SymbolTree

type GPUKernelLaunchInfo bpfGpuKernelLaunchT
type GPUMallocInfo bpfGpuMallocT
Expand Down Expand Up @@ -269,7 +268,7 @@ func (p *Tracer) readGPUKernelLaunchIntoSpan(record *ringbuf.Record) (request.Sp
}

// Log the GPU Kernel Launch event
p.log.Debug("GPU Kernel Launch", "event", event)
p.log.Info("GPU Kernel Launch", "event", event)

// Find the symbol for the kernel launch
symbol, ok := p.symForAddr(int32(event.PidInfo.UserPid), event.PidInfo.Ns, event.KernFuncOff)
Expand All @@ -288,27 +287,24 @@ func (p *Tracer) callStack(event *GPUKernelLaunchInfo) string {
if event.UstackSz > 1 {
cs := []string{}

for i := 1; i < int(event.UstackSz); i++ {
for i := 0; i < int(event.UstackSz); i++ {
addr := event.Ustack[i]
if addr != 0 {
symbol, ok := p.symForAddr(int32(event.PidInfo.UserPid), event.PidInfo.Ns, event.KernFuncOff)
if !ok {
symbol = "<unknown>"
} else {
symbol, ok := p.symForAddr(int32(event.PidInfo.UserPid), event.PidInfo.Ns, addr)
if ok {
symbol = p.symToName(symbol)
cs = append(cs, symbol)
}

cs = append(cs, symbol)
}
}

return strings.Join(cs, " <- ")
return strings.Join(cs, ";")
}

return ""
}

func (p *Tracer) processCudaLibFileInfo(info *exec.FileInfo, lib string, maps []*procfs.ProcMap, symMods moduleOffsets) (map[int64]string, *procfs.ProcMap, bool) {
func (p *Tracer) processCudaLibFileInfo(info *exec.FileInfo, lib string, maps []*procfs.ProcMap, symMods moduleOffsets) (*SymbolTree, *procfs.ProcMap, bool) {
cudaMap := exec.LibPathPlain(lib, maps)

if cudaMap == nil {
Expand Down Expand Up @@ -379,6 +375,11 @@ func (p *Tracer) processCudaFileInfo(info *exec.FileInfo) {
disovered = append(disovered, mod)
}
}
if strings.Contains(m.Pathname, "/ggml") {
if mod := p.discoverModule(info, maps, symModules, m.Pathname); mod != nil {
disovered = append(disovered, mod)
}
}
}

p.log.Debug("Processing cuda symbol map for", "inode", info.Ino)
Expand Down Expand Up @@ -475,8 +476,11 @@ func (p *Tracer) symForAddr(pid int32, ns uint32, off uint64) (string, bool) {
if off > m.base && off < m.end {
modSyms, ok := syms[m.ino]
if ok {
sym, ok := modSyms[int64(off)-int64(m.base)]
return sym, ok
res := modSyms.Search(off - m.base)
if len(res) > 0 {
return res[0].Symbol, true
}
return "", false
} else {
p.log.Warn("Can't find mod sym for", "ino", m.ino)
}
Expand All @@ -486,14 +490,14 @@ func (p *Tracer) symForAddr(pid int32, ns uint32, off uint64) (string, bool) {
return "", false
}

func (p *Tracer) collectSymbols(f *elf.File, syms []elf.Symbol, addressToName map[int64]string) {
func (p *Tracer) collectSymbols(f *elf.File, syms []elf.Symbol, tree *SymbolTree) {
for _, s := range syms {
if elf.ST_TYPE(s.Info) != elf.STT_FUNC {
// Symbol not associated with a function or other executable code.
continue
}

address := int64(s.Value)
address := s.Value
// Loop over ELF segments.
for _, prog := range f.Progs {
// Skip uninteresting segments.
Expand All @@ -502,30 +506,32 @@ func (p *Tracer) collectSymbols(f *elf.File, syms []elf.Symbol, addressToName ma
}

if prog.Vaddr <= s.Value && s.Value < (prog.Vaddr+prog.Memsz) {
address = int64(s.Value) - int64(prog.Vaddr)
address = s.Value - prog.Vaddr
break
}
}
addressToName[address] = s.Name
if address != 0 {
tree.Insert(Symbol{Low: address, High: address + s.Size, Symbol: s.Name})
}
}
}

// findSymbolAddresses returns an interval tree of the ELF function symbols, keyed by each symbol's address range
func (p *Tracer) findSymbolAddresses(f *elf.File) (map[int64]string, error) {
addressToName := map[int64]string{}
func (p *Tracer) findSymbolAddresses(f *elf.File) (*SymbolTree, error) {
t := SymbolTree{}
syms, err := f.Symbols()
if err != nil && !errors.Is(err, elf.ErrNoSymbols) {
return nil, err
}

p.collectSymbols(f, syms, addressToName)
p.collectSymbols(f, syms, &t)

dynsyms, err := f.DynamicSymbols()
if err != nil && !errors.Is(err, elf.ErrNoSymbols) {
return nil, err
}

p.collectSymbols(f, dynsyms, addressToName)
p.collectSymbols(f, dynsyms, &t)

return addressToName, nil
return &t, nil
}
74 changes: 74 additions & 0 deletions pkg/internal/ebpf/gpuevent/symboltree.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package gpuevent

// Symbol is a named half-open address range [Low, High).
type Symbol struct {
	Low, High uint64
	Symbol    string
}

// Node is a single node of the interval tree. Nodes are ordered by
// Symbol.Low: smaller keys go to Left, equal or greater keys go to Right.
type Node struct {
	Symbol Symbol
	// Max is the largest Symbol.High stored in the subtree rooted at this node.
	Max   uint64
	Left  *Node
	Right *Node
}

// SymbolTree is an interval tree of symbols. The tree is an unbalanced binary
// search tree; the zero value is an empty tree that is ready to use.
type SymbolTree struct {
	Root *Node
}

// NewSymbolTree creates a new, empty interval tree.
func NewSymbolTree() *SymbolTree {
	return &SymbolTree{}
}

// Insert adds sym to the tree. Duplicate and overlapping ranges are kept.
func (t *SymbolTree) Insert(sym Symbol) {
	t.Root = insert(t.Root, sym)
}

// insert places sym in the subtree rooted at root and returns the root of the
// resulting subtree (a new node when root is nil, root itself otherwise).
//
// The walk is iterative: the tree is never rebalanced, so input that arrives
// in roughly sorted order produces a chain as deep as the number of symbols,
// and a recursive descent would need one stack frame per level.
func insert(root *Node, sym Symbol) *Node {
	leaf := &Node{
		Symbol: sym,
		Max:    sym.High,
	}
	if root == nil {
		return leaf
	}

	n := root
	for {
		// Every node on the path becomes an ancestor of the new leaf, so its
		// subtree maximum has to cover the new upper bound.
		if n.Max < sym.High {
			n.Max = sym.High
		}

		next := &n.Right
		if sym.Low < n.Symbol.Low {
			next = &n.Left
		}

		if *next == nil {
			*next = leaf
			return root
		}
		n = *next
	}
}

// Search returns every symbol whose range [Low, High) contains point. Matches
// are reported in pre-order (node, then left subtree, then right subtree); the
// result is nil when nothing matches.
func (t *SymbolTree) Search(point uint64) []Symbol {
	var result []Symbol
	search(t.Root, point, &result)
	return result
}

// search appends to result every symbol in the subtree rooted at root that
// contains point, skipping subtrees that cannot hold a match.
func search(root *Node, point uint64, result *[]Symbol) {
	// The right-hand descent is a loop rather than a recursive call, which
	// keeps the stack shallow on right-leaning chains.
	for n := root; n != nil; n = n.Right {
		// High is exclusive, so a subtree whose largest High is <= point
		// cannot contain point anywhere.
		if n.Max <= point {
			return
		}

		if n.Symbol.Low <= point && point < n.Symbol.High {
			*result = append(*result, n.Symbol)
		}

		if n.Left != nil && n.Left.Max > point {
			search(n.Left, point, result)
		}

		// Every symbol to the right starts at or after n.Symbol.Low, so once
		// point lies below that start nothing further right can match.
		if point < n.Symbol.Low {
			return
		}
	}
}
18 changes: 18 additions & 0 deletions pkg/internal/ebpf/gpuevent/symboltree_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package gpuevent

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestSymbolTree checks point lookups against two adjacent half-open symbol
// ranges, including the boundary they share.
func TestSymbolTree(t *testing.T) {
	tr := &SymbolTree{}

	tr.Insert(Symbol{Low: 100, High: 200, Symbol: "test"})
	tr.Insert(Symbol{Low: 200, High: 300, Symbol: "test2"})

	// 200 is the exclusive end of "test" and the inclusive start of "test2".
	r := tr.Search(200)
	// Only index into the result once its length is confirmed, so a wrong
	// result is reported as a failed assertion rather than an index panic.
	if assert.Len(t, r, 1) {
		assert.Equal(t, "test2", r[0].Symbol)
	}

	// The inclusive lower bound belongs to the first symbol.
	r = tr.Search(100)
	if assert.Len(t, r, 1) {
		assert.Equal(t, "test", r[0].Symbol)
	}

	// Points outside every range match nothing.
	assert.Empty(t, tr.Search(99))
	assert.Empty(t, tr.Search(300))
}

0 comments on commit 7dd2e9d

Please sign in to comment.