Skip to content

Commit

Permalink
perf, ringbuf: add Flush for manual Read/ReadInto wakeup
Browse files Browse the repository at this point in the history
Add a method Flush which interrupts a perf or ringbuf reader and
causes it to read all data from the ring. This is very similar to
the logic we use to check for data in the ring when a deadline is
expired. The semantics of the Read function change slightly: a
caller is now guaranteed to receive an os.ErrDeadlineExceeded
which wasn't the case when the ring contained data.

Signed-off-by: Bryce Kahle <bryce.kahle@datadoghq.com>
Co-authored-by: Lorenz Bauer <lmb@isovalent.com>
  • Loading branch information
brycekahle and lmb committed Jun 24, 2024
1 parent fc4f4c5 commit 83b5a83
Show file tree
Hide file tree
Showing 7 changed files with 294 additions and 60 deletions.
103 changes: 79 additions & 24 deletions internal/epoll/poller.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
package epoll

import (
"errors"
"fmt"
"math"
"os"
"runtime"
"slices"
"sync"
"time"

"github.com/cilium/ebpf/internal"
"github.com/cilium/ebpf/internal/unix"
)

var ErrFlushed = errors.New("data was flushed")

// Poller waits for readiness notifications from multiple file descriptors.
//
// The wait can be interrupted by calling Close.
Expand All @@ -21,27 +25,48 @@ type Poller struct {
epollMu sync.Mutex
epollFd int

eventMu sync.Mutex
event *eventFd
eventMu sync.Mutex
closeEvent *eventFd
flushEvent *eventFd
}

func New() (*Poller, error) {
func New() (_ *Poller, err error) {
closeFDOnError := func(fd int) {
if err != nil {
unix.Close(fd)
}
}
closeEventFDOnError := func(e *eventFd) {
if err != nil {
e.close()
}
}

epollFd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
if err != nil {
return nil, fmt.Errorf("create epoll fd: %v", err)
}
defer closeFDOnError(epollFd)

p := &Poller{epollFd: epollFd}
p.event, err = newEventFd()
p.closeEvent, err = newEventFd()
if err != nil {
return nil, err
}
defer closeEventFDOnError(p.closeEvent)

p.flushEvent, err = newEventFd()
if err != nil {
unix.Close(epollFd)
return nil, err
}
defer closeEventFDOnError(p.flushEvent)

if err := p.Add(p.closeEvent.raw, 0); err != nil {
return nil, fmt.Errorf("add close eventfd: %w", err)
}

if err := p.Add(p.event.raw, 0); err != nil {
unix.Close(epollFd)
p.event.close()
return nil, fmt.Errorf("add eventfd: %w", err)
if err := p.Add(p.flushEvent.raw, 0); err != nil {
return nil, fmt.Errorf("add flush eventfd: %w", err)
}

runtime.SetFinalizer(p, (*Poller).Close)
Expand All @@ -55,8 +80,8 @@ func New() (*Poller, error) {
func (p *Poller) Close() error {
runtime.SetFinalizer(p, nil)

// Interrupt Wait() via the event fd if it's currently blocked.
if err := p.wakeWait(); err != nil {
// Interrupt Wait() via the closeEvent fd if it's currently blocked.
if err := p.wakeWaitForClose(); err != nil {
return err
}

Expand All @@ -73,9 +98,14 @@ func (p *Poller) Close() error {
p.epollFd = -1
}

if p.event != nil {
p.event.close()
p.event = nil
if p.closeEvent != nil {
p.closeEvent.close()
p.closeEvent = nil
}

if p.flushEvent != nil {
p.flushEvent.close()
p.flushEvent = nil
}

return nil
Expand Down Expand Up @@ -118,8 +148,11 @@ func (p *Poller) Add(fd int, id int) error {

// Wait for events.
//
// Returns the number of pending events or an error wrapping os.ErrClosed if
// Close is called, or os.ErrDeadlineExceeded if EpollWait timeout.
// Returns the number of pending events and any errors.
//
// - [os.ErrClosed] if interrupted by [Close].
// - [ErrFlushed] if interrupted by [Flush].
// - [os.ErrDeadlineExceeded] if deadline is reached.
func (p *Poller) Wait(events []unix.EpollEvent, deadline time.Time) (int, error) {
p.epollMu.Lock()
defer p.epollMu.Unlock()
Expand Down Expand Up @@ -154,33 +187,55 @@ func (p *Poller) Wait(events []unix.EpollEvent, deadline time.Time) (int, error)
return 0, fmt.Errorf("epoll wait: %w", os.ErrDeadlineExceeded)
}

for _, event := range events[:n] {
if int(event.Fd) == p.event.raw {
// Since we don't read p.event the event is never cleared and
for i := 0; i < n; {
event := events[i]
if int(event.Fd) == p.closeEvent.raw {
// Since we don't read p.closeEvent the event is never cleared and
// we'll keep getting this wakeup until Close() acquires the
// lock and sets p.epollFd = -1.
return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed)
}
if int(event.Fd) == p.flushEvent.raw {
// read event to prevent it from continuing to wake
p.flushEvent.read()
err = ErrFlushed
events = slices.Delete(events, i, i+1)
n -= 1
continue
}
i++
}

return n, nil
return n, err
}
}

type temporaryError interface {
Temporary() bool
}

// wakeWait unblocks Wait if it's epoll_wait.
func (p *Poller) wakeWait() error {
// wakeWaitForClose unblocks Wait if it's epoll_wait.
func (p *Poller) wakeWaitForClose() error {
p.eventMu.Lock()
defer p.eventMu.Unlock()

if p.closeEvent == nil {
return fmt.Errorf("epoll wake: %w", os.ErrClosed)
}

return p.closeEvent.add(1)
}

// Flush unblocks Wait if it's epoll_wait, for purposes of reading pending samples
func (p *Poller) Flush() error {
p.eventMu.Lock()
defer p.eventMu.Unlock()

if p.event == nil {
if p.flushEvent == nil {
return fmt.Errorf("epoll wake: %w", os.ErrClosed)
}

return p.event.add(1)
return p.flushEvent.add(1)
}

// eventFd wraps a Linux eventfd.
Expand Down
24 changes: 23 additions & 1 deletion internal/epoll/poller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/cilium/ebpf/internal/unix"
"github.com/go-quicktest/qt"
)

func TestPoller(t *testing.T) {
Expand Down Expand Up @@ -101,12 +102,33 @@ func TestPollerDeadline(t *testing.T) {
}()

// Wait for the goroutine to enter the syscall.
time.Sleep(time.Second)
time.Sleep(500 * time.Microsecond)

poller.Close()
<-done
}

func TestPollerFlush(t *testing.T) {
t.Parallel()

_, poller := mustNewPoller(t)
events := make([]unix.EpollEvent, 1)

done := make(chan struct{})
go func() {
defer close(done)

_, err := poller.Wait(events, time.Time{})
qt.Check(t, qt.ErrorIs(err, ErrFlushed))
}()

// Wait for the goroutine to enter the syscall.
time.Sleep(500 * time.Microsecond)

poller.Flush()
<-done
}

func mustNewPoller(t *testing.T) (*eventFd, *Poller) {
t.Helper()

Expand Down
35 changes: 22 additions & 13 deletions perf/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ import (
)

var (
ErrClosed = os.ErrClosed
errEOR = errors.New("end of ring")
ErrClosed = os.ErrClosed
ErrFlushed = epoll.ErrFlushed
errEOR = errors.New("end of ring")
)

var perfEventHeaderSize = binary.Size(perfEventHeader{})
Expand Down Expand Up @@ -160,6 +161,8 @@ type Reader struct {
overwritable bool

bufferSize int

pendingErr error
}

// ReaderOptions control the behaviour of the user
Expand Down Expand Up @@ -318,18 +321,18 @@ func (pr *Reader) SetDeadline(t time.Time) {

// Read the next record from the perf ring buffer.
//
// The function blocks until there are at least Watermark bytes in one
// The method blocks until there are at least Watermark bytes in one
// of the per CPU buffers. Records from buffers below the Watermark
// are not returned.
//
// Records can contain between 0 and 7 bytes of trailing garbage from the ring
// depending on the input sample's length.
//
// Calling Close interrupts the function.
// Calling [Close] interrupts the method with [os.ErrClosed]. Calling [Flush]
// makes it return all records currently in the ring buffer, followed by [ErrFlushed].
//
// Returns [os.ErrDeadlineExceeded] if a deadline was set and the perf ring buffer
// was empty. Otherwise returns a record and no error, even if the deadline was
// exceeded.
// Returns [os.ErrDeadlineExceeded] if a deadline was set and after all records
// have been read from the ring.
//
// See [Reader.ReadInto] for a more efficient version of this method.
func (pr *Reader) Read() (Record, error) {
Expand All @@ -356,13 +359,13 @@ func (pr *Reader) ReadInto(rec *Record) error {
return fmt.Errorf("perf ringbuffer: %w", ErrClosed)
}

deadlineWasExceeded := false
for {
if len(pr.epollRings) == 0 {
if deadlineWasExceeded {
// All rings were empty when the deadline expired, return
if pe := pr.pendingErr; pe != nil {
// All rings have been emptied since the error occurred, return
// appropriate error.
return os.ErrDeadlineExceeded
pr.pendingErr = nil
return pe
}

// NB: The deferred pauseMu.Unlock will panic if Wait panics, which
Expand All @@ -371,10 +374,10 @@ func (pr *Reader) ReadInto(rec *Record) error {
_, err := pr.poller.Wait(pr.epollEvents, pr.deadline)
pr.pauseMu.Lock()

if errors.Is(err, os.ErrDeadlineExceeded) {
if errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, ErrFlushed) {
// We've hit the deadline, check whether there is any data in
// the rings that we've not been woken up for.
deadlineWasExceeded = true
pr.pendingErr = err
} else if err != nil {
return err
}
Expand Down Expand Up @@ -463,6 +466,12 @@ func (pr *Reader) BufferSize() int {
return pr.bufferSize
}

// Flush unblocks Read/ReadInto and successive Read/ReadInto calls will return pending samples at this point,
// until you receive a [ErrFlushed] error.
func (pr *Reader) Flush() error {
return pr.poller.Flush()
}

// NB: Has to be preceded by a call to ring.loadHead.
func (pr *Reader) readRecordFromRing(rec *Record, ring *perfEventRing) error {
defer ring.writeTail()
Expand Down
71 changes: 71 additions & 0 deletions perf/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,77 @@ func TestReaderSetDeadline(t *testing.T) {
if _, err := rd.Read(); !errors.Is(err, os.ErrDeadlineExceeded) {
t.Error("Expected os.ErrDeadlineExceeded from second Read, got:", err)
}

rd.SetDeadline(time.Now().Add(10 * time.Millisecond))
if _, err := rd.Read(); !errors.Is(err, os.ErrDeadlineExceeded) {
t.Error("Expected os.ErrDeadlineExceeded from third Read, got:", err)
}
}

func TestReaderSetDeadlinePendingEvents(t *testing.T) {
events := perfEventArray(t)

rd, err := NewReaderWithOptions(events, 4096, ReaderOptions{WakeupEvents: 2})
if err != nil {
t.Fatal(err)
}
defer rd.Close()

outputSamples(t, events, 5)

rd.SetDeadline(time.Now().Add(-time.Second))
_, rem := checkRecord(t, rd)
qt.Assert(t, qt.Equals(rem, 0), qt.Commentf("expected zero Remaining"))

outputSamples(t, events, 5)

// another sample should not be returned before we get ErrFlushed to indicate initial set of samples read
_, err = rd.Read()
if !errors.Is(err, os.ErrDeadlineExceeded) {
t.Error("Expected os.ErrDeadlineExceeded from second Read, got:", err)
}

// the second sample should now be read
_, _ = checkRecord(t, rd)
}

func TestReaderFlushPendingEvents(t *testing.T) {
testutils.LockOSThreadToSingleCPU(t)
events := perfEventArray(t)

rd, err := NewReaderWithOptions(events, 4096, ReaderOptions{WakeupEvents: 2})
if err != nil {
t.Fatal(err)
}
defer rd.Close()

outputSamples(t, events, 5)

wait := make(chan int)
go func() {
wait <- 0
_, rem := checkRecord(t, rd)
wait <- rem
}()

<-wait
time.Sleep(10 * time.Millisecond)
err = rd.Flush()
qt.Assert(t, qt.IsNil(err))

rem := <-wait
qt.Assert(t, qt.Equals(rem, 0), qt.Commentf("expected zero Remaining"))

outputSamples(t, events, 5)

// another sample should not be returned before we get ErrFlushed to indicate initial set of samples read
_, err = rd.Read()
if !errors.Is(err, ErrFlushed) {
t.Error("Expected ErrFlushed from second Read, got:", err)
}

// the second sample should now be read
_, _ = checkRecord(t, rd)
}

func outputSamples(tb testing.TB, events *ebpf.Map, sampleSizes ...byte) {
Expand Down
Loading

0 comments on commit 83b5a83

Please sign in to comment.