Improve watcher and TestWatcher_AgentErrorQuick logs #5345
@@ -93,14 +93,14 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 			if failedErr == nil {
 				flipFlopCount++
 				failedTimer.Reset(ch.checkInterval)
-				ch.log.Error("Agent reported failure (starting failed timer): %s", err)
+				ch.log.Errorf("Agent reported failure (starting failed timer): %s", err)
 			} else {
-				ch.log.Error("Agent reported failure (failed timer already started): %s", err)
+				ch.log.Errorf("Agent reported failure (failed timer already started): %s", err)
 			}
 		} else {
 			if failedErr != nil {
 				failedTimer.Stop()
-				ch.log.Error("Agent reported healthy (failed timer stopped): %s", err)
+				ch.log.Info("Agent reported healthy (failed timer stopped)")
Review comments on the ch.log.Info line above:
- Are we not going to lose useful info by omitting the error?
- Do we really want to add this as an Info log instead of an Error log?
- this is the else branch of …
- I see now, sorry, I got confused by the failedErr != nil check. In that case couldn't we log failedErr? Does that have any meaning to the error message?
 			}
 		}
 		failedErr = err
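A note on why the Error → Errorf changes above matter: in zap-style sugared loggers, Error joins its arguments fmt.Sprint-style and does not interpret format verbs, while Errorf formats fmt.Sprintf-style. A minimal sketch using zap directly (assuming the elastic-agent logger follows zap's SugaredLogger semantics, which these method names suggest):

package main

import (
	"errors"

	"go.uber.org/zap"
)

func main() {
	log := zap.NewExample().Sugar()
	err := errors.New("agent unhealthy")

	// Error joins its arguments like fmt.Sprint, so the "%s" verb is not
	// interpreted and the message comes out as:
	//   Agent reported failure: %sagent unhealthy
	log.Error("Agent reported failure: %s", err)

	// Errorf formats like fmt.Sprintf, substituting the error into %s:
	//   Agent reported failure: agent unhealthy
	log.Errorf("Agent reported failure: %s", err)
}

With the old Error calls, the literal %s survived into the log line, which is exactly what this part of the diff fixes.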
@@ -115,7 +115,8 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					continue
 				}
 				// error lasted longer than the checkInterval, notify!
-				ch.notifyChan <- failedErr
+				ch.notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
+					ch.checkInterval, failedErr)
 			}
 		}
 	}()
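For readers skimming the surrounding loop: an error is only pushed to notifyChan if it survives a full checkInterval without a healthy report clearing it. A stripped-down sketch of that debounce pattern under assumed names (notifyIfPersistent and the channel wiring are hypothetical; the real watcher carries extra state such as the flip-flop counter):

package watchsketch

import (
	"fmt"
	"time"
)

// notifyIfPersistent is a hypothetical reduction of the watcher loop: it
// forwards an error only if no healthy report clears it before
// checkInterval elapses.
func notifyIfPersistent(states <-chan error, notify chan<- error, checkInterval time.Duration) {
	var failedErr error
	failedTimer := time.NewTimer(checkInterval)
	failedTimer.Stop() // armed only while an error is outstanding

	for {
		select {
		case err := <-states:
			if err != nil && failedErr == nil {
				failedTimer.Reset(checkInterval) // error appeared: start the grace period
			} else if err == nil && failedErr != nil {
				failedTimer.Stop() // cleared in time: no notification
			}
			failedErr = err
		case <-failedTimer.C:
			if failedErr == nil {
				continue
			}
			// The error outlived the grace period; wrap it with %w so the
			// receiver can still inspect the cause via errors.Is/As.
			notify <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
				checkInterval, failedErr)
		}
	}
}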
@@ -138,7 +139,7 @@ LOOP:
 		connectCancel()
 		if err != nil {
 			ch.connectCounter++
-			ch.log.Error("Failed connecting to running daemon: ", err)
+			ch.log.Errorf("Failed connecting to running daemon: %s", err)

 			if ch.checkFailures() {
 				return
 			}
@@ -152,7 +153,7 @@ LOOP:
 			// considered a connect error
 			stateCancel()
 			ch.agentClient.Disconnect()
-			ch.log.Error("Failed to start state watch: ", err)
+			ch.log.Errorf("Failed to start state watch: %s", err)
 			ch.connectCounter++
 			if ch.checkFailures() {
 				return
@@ -178,25 +179,30 @@ LOOP:
 			for {
 				state, err := watch.Recv()
 				if err != nil {
+					ch.log.Debugf("received state: error: %s",
+						err)

 					// agent has crashed or exited
 					stateCancel()
 					ch.agentClient.Disconnect()
-					ch.log.Error("Lost connection: failed reading next state: ", err)
+					ch.log.Errorf("Lost connection: failed reading next state: %s", err)
Review comment on this line:
- Nitpick: maybe we can get a clearer message by offering context of what we were doing when we received the error, followed by the error itself (if Zap supports %w as @mauri870 said, it could also be worth using it here).
Suggested change: …
 					ch.lostCounter++
 					if ch.checkFailures() {
 						return
 					}
 					continue LOOP
 				}
+				ch.log.Debugf("received state: %s:%s",
+					state.State, state.Message)

 				// gRPC is good at hiding the fact that connection was lost
 				// to ensure that we don't miss a restart a changed PID means
 				// we are now talking to a different spawned Elastic Agent
 				if ch.lastPid == -1 {
 					ch.lastPid = state.Info.PID
-					ch.log.Info(fmt.Sprintf("Communicating with PID %d", ch.lastPid))
+					ch.log.Infof("Communicating with PID %d", ch.lastPid)
 				} else if ch.lastPid != state.Info.PID {
-					ch.log.Error(fmt.Sprintf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID))
+					ch.log.Errorf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID)
 					ch.lastPid = state.Info.PID
 					// count the PID change as a lost connection, but allow
 					// the communication to continue unless has become a failure
@@ -270,14 +270,25 @@ func TestWatcher_AgentError(t *testing.T) {
 }

 func TestWatcher_AgentErrorQuick(t *testing.T) {
-	// test tests for success, which only happens when no error comes in
-	// during this time period
+	// Success only happens when no error comes in during this time period
 	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
 	defer cancel()

 	errCh := make(chan error)
-	logger, _ := logger.NewTesting("watcher")
-	w := NewAgentWatcher(errCh, logger, 100*time.Millisecond)
+	log, obs := logger.NewTesting("watcher")
+	defer func() {
+		if t.Failed() {
+			rawLogs := obs.All()
+			for _, rawLog := range rawLogs {
+				msg := fmt.Sprintf("[%s] %s", rawLog.Level, rawLog.Message)
+				for k, v := range rawLog.ContextMap() {
+					msg += fmt.Sprintf("%s=%v", k, v)
+				}
+				t.Log(msg)
+			}
+		}
Review comment on lines +280 to +289:
- I'll add a helper function together with …
+	}()
+	w := NewAgentWatcher(errCh, log, 100*time.Millisecond)

 	// reports an error state, followed by a healthy state (should not error)
 	mockHandler := func(srv cproto.ElasticAgentControl_StateWatchServer) error {
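The dump-logs-on-failure defer in the hunk above depends on logger.NewTesting returning an observed-logs handle alongside the logger. The same pattern can be reproduced with zap's zaptest/observer package; a self-contained sketch (the test body is hypothetical, the go.uber.org/zap APIs are real):

package watchsketch_test

import (
	"fmt"
	"testing"

	"go.uber.org/zap"
	"go.uber.org/zap/zaptest/observer"
)

func TestDumpLogsOnFailure(t *testing.T) {
	// observer.New returns a zap core plus a handle over everything logged.
	core, obs := observer.New(zap.DebugLevel)
	log := zap.New(core).Sugar()

	// Replay captured logs only if the test failed, keeping passing runs quiet.
	defer func() {
		if t.Failed() {
			for _, entry := range obs.All() {
				msg := fmt.Sprintf("[%s] %s", entry.Level, entry.Message)
				for k, v := range entry.ContextMap() {
					msg += fmt.Sprintf(" %s=%v", k, v)
				}
				t.Log(msg)
			}
		}
	}()

	log.Debugw("connecting", "pid", 1234) // captured; only shown on failure
}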
@@ -302,7 +313,7 @@ func TestWatcher_AgentErrorQuick(t *testing.T) {
 		return nil
 	}
 	mock := &mockDaemon{watch: mockHandler}
-	require.NoError(t, mock.Start())
+	require.NoError(t, mock.Start(), "could not start mock agent daemon")
 	defer mock.Stop()

 	// set client to mock; before running
Review comment:
- NIT: I think Zap supports %w behind the scenes. Would it be better to use %w for the errors? I know they won't be unwrapped, but at least it would be a clear visual indicator that you're formatting an error type.
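On that %w question: in the standard library, only fmt.Errorf treats %w specially, keeping the wrapped error reachable for errors.Is and errors.As, while %s flattens it to a string. A minimal sketch of the difference (standard library only, independent of how zap handles format verbs):

package main

import (
	"errors"
	"fmt"
)

var errDaemon = errors.New("daemon unreachable")

func main() {
	// %w wraps: the original error stays reachable through the chain.
	wrapped := fmt.Errorf("failed reading next state: %w", errDaemon)
	fmt.Println(errors.Is(wrapped, errDaemon)) // true

	// %s only stringifies: the chain is broken.
	flattened := fmt.Errorf("failed reading next state: %s", errDaemon)
	fmt.Println(errors.Is(flattened, errDaemon)) // false
}

This unwrapping behavior is what makes the wrapped notifyChan error in the first file inspectable downstream.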