Skip to content

Commit

Permalink
Catch and log panics (elastic#6199)
Browse files Browse the repository at this point in the history
When a Beat exits due to panic the stack trace is written to stderr. For Beats running under service managers where the stdout/err are discarded this makes it problematic to debug (or even detect a panic).

This commit modifies the beat runner to catch (aka `recover`) the panicking main goroutine, log it with logp, flush the logger, then exit. This ensures that the panic is present in the log output.

Under normal circumstances (no error) the final log message from every Beat will be `<beatname> stopped.`. When there is a panic a l`evel=fatal` log line will be written that contains the panic message and a stack trace. For example:

```
{
  "level": "info",
  "timestamp": "2018-01-26T13:36:59.708-0500",
  "caller": "runtime/asm_amd64.s:510",
  "message": "auditbeat stopped."
}
{
  "level": "fatal",
  "timestamp": "2018-01-26T13:36:59.709-0500",
  "logger": "auditbeat",
  "caller": "instance/beat.go:134",
  "message": "Failed due to panic.",
  "panic": "Oh no, big problem!",
  "stack": "github.com/elastic/beats/libbeat/cmd/instance.Run.func1.1\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:134\nruntime.call32\n\t/Users/akroh/.gvm/versions/go1.9.2.darwin.amd64/src/runtime/asm_amd64.s:509\nruntime.gopanic\n\t/Users/akroh/.gvm/versions/go1.9.2.darwin.amd64/src/runtime/panic.go:491\ngithub.com/elastic/beats/libbeat/cmd/instance.(*Beat).launch\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:315\ngithub.com/elastic/beats/libbeat/cmd/instance.Run.func1\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:142\ngithub.com/elastic/beats/libbeat/cmd/instance.Run\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/instance/beat.go:143\ngithub.com/elastic/beats/libbeat/cmd.genRunCmd.func1\n\t/Users/akroh/go/src/github.com/elastic/beats/libbeat/cmd/run.go:19\ngithub.com/elastic/beats/vendor/github.com/spf13/cobra.(*Command).execute\n\t/Users/akroh/go/src/github.com/elastic/beats/vendor/github.com/spf13/cobra/command.go:704\ngithub.com/elastic/beats/vendor/github.com/spf13/cobra.(*Command).ExecuteC\n\t/Users/akroh/go/src/github.com/elastic/beats/vendor/github.com/spf13/cobra/command.go:785\ngithub.com/elastic/beats/vendor/github.com/spf13/cobra.(*Command).Execute\n\t/Users/akroh/go/src/github.com/elastic/beats/vendor/github.com/spf13/cobra/command.go:738\nmain.main\n\t/Users/akroh/go/src/github.com/elastic/beats/auditbeat/main.go:14\nruntime.main\n\t/Users/akroh/.gvm/versions/go1.9.2.darwin.amd64/src/runtime/proc.go:195"
}
```

Log that panic object seperately
  • Loading branch information
andrewkroh authored and ruflin committed Jan 30, 2018
1 parent 847b6ac commit d9b1237
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ https://github.com/elastic/beats/compare/v6.0.0-beta2...master[Check the HEAD di
- Add the abilility for the add_docker_metadata process to enrich based on process ID. {pull}6100[6100]
- The `add_docker_metadata` and `add_kubernetes_metadata` processors are now GA, instead of Beta. {pull}6105[6105]
- The node name can be discovered automatically by machine-id matching when beat deployed outside kubernetes cluster. {pull}6146[6146]
- Panics will be written to the logger before exiting. {pull}6199[6199]

*Auditbeat*

Expand Down
9 changes: 8 additions & 1 deletion libbeat/cmd/instance/beat.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"time"

"github.com/satori/go.uuid"
"go.uber.org/zap"

"github.com/elastic/beats/libbeat/api"
"github.com/elastic/beats/libbeat/beat"
Expand Down Expand Up @@ -128,6 +129,12 @@ func initRand() {
// XXX Move this as a *Beat method?
func Run(name, idxPrefix, version string, bt beat.Creator) error {
return handleError(func() error {
defer func() {
if r := recover(); r != nil {
logp.NewLogger(name).Fatalw("Failed due to panic.",
"panic", r, zap.Stack("stack"))
}
}()
b, err := NewBeat(name, idxPrefix, version)
if err != nil {
return err
Expand Down Expand Up @@ -248,6 +255,7 @@ func (b *Beat) createBeater(bt beat.Creator) (beat.Beater, error) {

func (b *Beat) launch(bt beat.Creator) error {
defer logp.Sync()
defer logp.Info("%s stopped.", b.Info.Beat)

err := b.Init()
if err != nil {
Expand Down Expand Up @@ -299,7 +307,6 @@ func (b *Beat) launch(bt beat.Creator) error {
}

logp.Info("%s start running.", b.Info.Beat)
defer logp.Info("%s stopped.", b.Info.Beat)

if b.Config.HTTP.Enabled() {
api.Start(b.Config.HTTP, b.Info)
Expand Down
7 changes: 4 additions & 3 deletions libbeat/logp/core_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func TestLoggerLevel(t *testing.T) {
}

func TestRecover(t *testing.T) {
const recoveryExplanation = "Something went wrong."
const recoveryExplanation = "Something went wrong"
const cause = "unexpected condition"

DevelopmentSetup(ToObserverOutput())
Expand All @@ -160,8 +160,9 @@ func TestRecover(t *testing.T) {
assert.Equal(t, zap.ErrorLevel, log.Level)
assert.Equal(t, "logp/core_test.go",
strings.Split(log.Caller.TrimmedPath(), ":")[0])
assert.Contains(t, log.Message, recoveryExplanation)
assert.Contains(t, log.Message, cause)
assert.Contains(t, log.Message, recoveryExplanation+
". Recovering, but please report this.")
assert.Contains(t, log.ContextMap(), "panic")
}
}()

Expand Down
4 changes: 2 additions & 2 deletions libbeat/logp/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ func WTF(format string, v ...interface{}) {
// Recover stops a panicking goroutine and logs an Error.
func Recover(msg string) {
if r := recover(); r != nil {
msg := fmt.Sprintf("%s. Recovering, but please report this: %s.", msg, r)
msg := fmt.Sprintf("%s. Recovering, but please report this.", msg)
globalLogger().WithOptions(zap.AddCallerSkip(2)).
Error(msg, zap.Stack("stack"))
Error(msg, zap.Any("panic", r), zap.Stack("stack"))
}
}
19 changes: 19 additions & 0 deletions libbeat/logp/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ func (l *Logger) Error(args ...interface{}) {
l.sugar.Error(args...)
}

// Fatal uses fmt.Sprint to construct and log a message, then calls os.Exit(1).
func (l *Logger) Fatal(args ...interface{}) {
l.sugar.Fatal(args...)
}

// Panic uses fmt.Sprint to construct and log a message, then panics.
func (l *Logger) Panic(args ...interface{}) {
l.sugar.Panic(args...)
Expand Down Expand Up @@ -84,6 +89,11 @@ func (l *Logger) Errorf(format string, args ...interface{}) {
l.sugar.Errorf(format, args...)
}

// Fatalf uses fmt.Sprintf to log a templated message, then calls os.Exit(1).
func (l *Logger) Fatalf(format string, args ...interface{}) {
l.sugar.Fatalf(format, args...)
}

// Panicf uses fmt.Sprintf to log a templated message, then panics.
func (l *Logger) Panicf(format string, args ...interface{}) {
l.sugar.Panicf(format, args...)
Expand Down Expand Up @@ -129,6 +139,15 @@ func (l *Logger) Errorw(msg string, keysAndValues ...interface{}) {
l.sugar.Errorw(msg, keysAndValues...)
}

// Fatalw logs a message with some additional context, then calls os.Exit(1).
// The additional context is added in the form of key-value pairs. The optimal
// way to write the value to the log message will be inferred by the value's
// type. To explicitly specify a type you can pass a Field such as
// logp.Stringer.
func (l *Logger) Fatalw(msg string, keysAndValues ...interface{}) {
l.sugar.Fatalw(msg, keysAndValues...)
}

// Panicw logs a message with some additional context, then panics. The
// additional context is added in the form of key-value pairs. The optimal way
// to write the value to the log message will be inferred by the value's type.
Expand Down

0 comments on commit d9b1237

Please sign in to comment.