From 7dd2e9d435718e91293a86e992df18c75ee5fa97 Mon Sep 17 00:00:00 2001
From: Nikola Grcevski <6207777+grcevski@users.noreply.github.com>
Date: Mon, 20 Jan 2025 10:01:19 -0500
Subject: [PATCH] GPU Fix call stack issue and support ggml (#1546)

---
 pkg/internal/ebpf/gpuevent/gpuevent.go        | 59 ++++++++------
 pkg/internal/ebpf/gpuevent/symboltree.go      | 83 +++++++++++++++++++
 pkg/internal/ebpf/gpuevent/symboltree_test.go | 26 ++++++
 3 files changed, 145 insertions(+), 23 deletions(-)
 create mode 100644 pkg/internal/ebpf/gpuevent/symboltree.go
 create mode 100644 pkg/internal/ebpf/gpuevent/symboltree_test.go

diff --git a/pkg/internal/ebpf/gpuevent/gpuevent.go b/pkg/internal/ebpf/gpuevent/gpuevent.go
index 181774202..42bb81576 100644
--- a/pkg/internal/ebpf/gpuevent/gpuevent.go
+++ b/pkg/internal/ebpf/gpuevent/gpuevent.go
@@ -44,8 +44,7 @@ type modInfo struct {
 	ino  uint64
 }
 
-type offsetsMap map[int64]string
-type moduleOffsets map[uint64]offsetsMap
+type moduleOffsets map[uint64]*SymbolTree
 
 type GPUKernelLaunchInfo bpfGpuKernelLaunchT
 type GPUMallocInfo bpfGpuMallocT
@@ -288,27 +287,24 @@ func (p *Tracer) callStack(event *GPUKernelLaunchInfo) string {
 	if event.UstackSz > 1 {
 		cs := []string{}
 
-		for i := 1; i < int(event.UstackSz); i++ {
+		for i := 0; i < int(event.UstackSz); i++ {
 			addr := event.Ustack[i]
 			if addr != 0 {
-				symbol, ok := p.symForAddr(int32(event.PidInfo.UserPid), event.PidInfo.Ns, event.KernFuncOff)
-				if !ok {
-					symbol = ""
-				} else {
+				symbol, ok := p.symForAddr(int32(event.PidInfo.UserPid), event.PidInfo.Ns, addr)
+				if ok {
 					symbol = p.symToName(symbol)
+					cs = append(cs, symbol)
 				}
-
-				cs = append(cs, symbol)
 			}
 		}
 
-		return strings.Join(cs, " <- ")
+		return strings.Join(cs, ";")
 	}
 
 	return ""
 }
 
-func (p *Tracer) processCudaLibFileInfo(info *exec.FileInfo, lib string, maps []*procfs.ProcMap, symMods moduleOffsets) (map[int64]string, *procfs.ProcMap, bool) {
+func (p *Tracer) processCudaLibFileInfo(info *exec.FileInfo, lib string, maps []*procfs.ProcMap, symMods moduleOffsets) (*SymbolTree, *procfs.ProcMap, bool) {
 	cudaMap := exec.LibPathPlain(lib, maps)
 
 	if cudaMap == nil {
@@ -379,6 +375,11 @@ func (p *Tracer) processCudaFileInfo(info *exec.FileInfo) {
 				disovered = append(disovered, mod)
 			}
 		}
+		if strings.Contains(m.Pathname, "/ggml") {
+			if mod := p.discoverModule(info, maps, symModules, m.Pathname); mod != nil {
+				disovered = append(disovered, mod)
+			}
+		}
 	}
 
 	p.log.Debug("Processing cuda symbol map for", "inode", info.Ino)
@@ -475,8 +476,11 @@ func (p *Tracer) symForAddr(pid int32, ns uint32, off uint64) (string, bool) {
 		if off > m.base && off < m.end {
 			modSyms, ok := syms[m.ino]
 			if ok {
-				sym, ok := modSyms[int64(off)-int64(m.base)]
-				return sym, ok
+				res := modSyms.Search(off - m.base)
+				if len(res) > 0 {
+					return res[0].Symbol, true
+				}
+				return "", false
 			} else {
 				p.log.Warn("Can't find mod sym for", "ino", m.ino)
 			}
@@ -486,14 +490,14 @@ func (p *Tracer) symForAddr(pid int32, ns uint32, off uint64) (string, bool) {
 	return "", false
 }
 
-func (p *Tracer) collectSymbols(f *elf.File, syms []elf.Symbol, addressToName map[int64]string) {
+func (p *Tracer) collectSymbols(f *elf.File, syms []elf.Symbol, tree *SymbolTree) {
 	for _, s := range syms {
 		if elf.ST_TYPE(s.Info) != elf.STT_FUNC {
 			// Symbol not associated with a function or other executable code.
 			continue
 		}
 
-		address := int64(s.Value)
+		address := s.Value
 		// Loop over ELF segments.
 		for _, prog := range f.Progs {
 			// Skip uninteresting segments.
@@ -502,30 +506,39 @@ func (p *Tracer) collectSymbols(f *elf.File, syms []elf.Symbol, addressToName ma
 			}
 
 			if prog.Vaddr <= s.Value && s.Value < (prog.Vaddr+prog.Memsz) {
-				address = int64(s.Value) - int64(prog.Vaddr)
+				address = s.Value - prog.Vaddr
 				break
 			}
 		}
-		addressToName[address] = s.Name
+		if address != 0 {
+			// A zero-sized symbol would yield an empty [Low, High) range that no
+			// lookup can hit, so give it a one-byte extent.
+			high := address + s.Size
+			if s.Size == 0 {
+				high = address + 1
+			}
+			tree.Insert(Symbol{Low: address, High: high, Symbol: s.Name})
+		}
 	}
 }
 
-// returns a map of symbol addresses to names
-func (p *Tracer) findSymbolAddresses(f *elf.File) (map[int64]string, error) {
-	addressToName := map[int64]string{}
+// findSymbolAddresses returns an interval tree of the function symbols found
+// in the static and dynamic symbol tables of f.
+func (p *Tracer) findSymbolAddresses(f *elf.File) (*SymbolTree, error) {
+	t := SymbolTree{}
 	syms, err := f.Symbols()
 	if err != nil && !errors.Is(err, elf.ErrNoSymbols) {
 		return nil, err
 	}
 
-	p.collectSymbols(f, syms, addressToName)
+	p.collectSymbols(f, syms, &t)
 
 	dynsyms, err := f.DynamicSymbols()
 	if err != nil && !errors.Is(err, elf.ErrNoSymbols) {
 		return nil, err
 	}
 
-	p.collectSymbols(f, dynsyms, addressToName)
+	p.collectSymbols(f, dynsyms, &t)
 
-	return addressToName, nil
+	return &t, nil
 }
diff --git a/pkg/internal/ebpf/gpuevent/symboltree.go b/pkg/internal/ebpf/gpuevent/symboltree.go
new file mode 100644
index 000000000..8d388058e
--- /dev/null
+++ b/pkg/internal/ebpf/gpuevent/symboltree.go
@@ -0,0 +1,83 @@
+package gpuevent
+
+// Symbol is a named function symbol covering the half-open address range
+// [Low, High).
+type Symbol struct {
+	Low, High uint64
+	Symbol    string
+}
+
+// Node is a node of the interval tree. Nodes are ordered by Symbol.Low and Max
+// caches the largest High stored in the subtree rooted at the node.
+type Node struct {
+	Symbol Symbol
+	Max    uint64
+	Left   *Node
+	Right  *Node
+}
+
+// SymbolTree is an interval tree of symbols. It is not rebalanced on insert.
+type SymbolTree struct {
+	Root *Node
+}
+
+// NewSymbolTree creates a new, empty interval tree.
+func NewSymbolTree() *SymbolTree {
+	return &SymbolTree{}
+}
+
+// Insert adds sym to the interval tree.
+func (t *SymbolTree) Insert(sym Symbol) {
+	t.Root = insert(t.Root, sym)
+}
+
+// insert places sym in the subtree rooted at root, ordered by Low, and returns
+// the root of the updated subtree.
+func insert(root *Node, sym Symbol) *Node {
+	if root == nil {
+		return &Node{
+			Symbol: sym,
+			Max:    sym.High,
+		}
+	}
+
+	if sym.Low < root.Symbol.Low {
+		root.Left = insert(root.Left, sym)
+	} else {
+		root.Right = insert(root.Right, sym)
+	}
+
+	if root.Max < sym.High {
+		root.Max = sym.High
+	}
+
+	return root
+}
+
+// Search returns every symbol whose [Low, High) range contains point.
+func (t *SymbolTree) Search(point uint64) []Symbol {
+	var result []Symbol
+	search(t.Root, point, &result)
+	return result
+}
+
+// search appends to result every symbol in the subtree rooted at root whose
+// range contains point.
+func search(root *Node, point uint64, result *[]Symbol) {
+	// Max is an exclusive upper bound for the whole subtree, so nothing below
+	// root can contain point once point reaches it.
+	if root == nil || root.Max <= point {
+		return
+	}
+
+	if root.Symbol.Low <= point && point < root.Symbol.High {
+		*result = append(*result, root.Symbol)
+	}
+
+	search(root.Left, point, result)
+
+	// Every symbol in the right subtree starts at or after root.Symbol.Low.
+	if point >= root.Symbol.Low {
+		search(root.Right, point, result)
+	}
+}
diff --git a/pkg/internal/ebpf/gpuevent/symboltree_test.go b/pkg/internal/ebpf/gpuevent/symboltree_test.go
new file mode 100644
index 000000000..fb9ac1653
--- /dev/null
+++ b/pkg/internal/ebpf/gpuevent/symboltree_test.go
@@ -0,0 +1,26 @@
+package gpuevent
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestSymbolTree(t *testing.T) {
+	tr := &SymbolTree{}
+
+	tr.Insert(Symbol{Low: 100, High: 200, Symbol: "test"})
+	tr.Insert(Symbol{Low: 200, High: 300, Symbol: "test2"})
+
+	r := tr.Search(200)
+	assert.Equal(t, 1, len(r))
+	assert.Equal(t, "test2", r[0].Symbol)
+
+	r = tr.Search(199)
+	assert.Equal(t, 1, len(r))
+	assert.Equal(t, "test", r[0].Symbol)
+
+	// Points outside every [Low, High) range must not match.
+	assert.Empty(t, tr.Search(99))
+	assert.Empty(t, tr.Search(300))
+}