Skip to content

Commit

Permalink
fix flaky TestCoordinatorReportsInvalidPolicy (#4402) (#4465)
Browse files Browse the repository at this point in the history
* increase state change timeout, fail fast and better error messages
* print coordinator logs on failure

(cherry picked from commit fc67134)

Co-authored-by: Anderson Queiroz <anderson.queiroz@elastic.co>
  • Loading branch information
mergify[bot] and AndersonQ authored Mar 22, 2024
1 parent 818d204 commit 1eb18c5
Showing 1 changed file with 23 additions and 12 deletions.
35 changes: 23 additions & 12 deletions internal/pkg/agent/application/coordinator/coordinator_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/elastic/elastic-agent/pkg/component"
"github.com/elastic/elastic-agent/pkg/component/runtime"
agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client"
"github.com/elastic/elastic-agent/pkg/core/logger"
"github.com/elastic/elastic-agent/pkg/utils/broadcaster"
)

Expand Down Expand Up @@ -331,22 +332,31 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) {
// does let's report a failure instead of timing out the test runner.
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
logger := logp.NewLogger("testing")

log, obs := logger.NewTesting("")
defer func() {
if t.Failed() {
t.Log("test failed, coordinator logs below:")
for _, l := range obs.TakeAll() {
t.Log(l)
}
}
}()

upgradeMgr, err := upgrade.NewUpgrader(
logger,
log,
&artifact.Config{},
&info.AgentInfo{},
)
require.NoError(t, err)
require.NoError(t, err, "errored when creating a new upgrader")

// Channels have buffer length 1, so we don't have to run on multiple
// goroutines.
stateChan := make(chan State, 1)
configChan := make(chan ConfigChange, 1)
varsChan := make(chan []*transpiler.Vars, 1)
coord := &Coordinator{
logger: logger,
logger: log,
state: State{
CoordinatorState: agentclient.Healthy,
CoordinatorMessage: "Running",
Expand Down Expand Up @@ -390,19 +400,18 @@ agent.download.sourceURI:
"failed to reload upgrade manager configuration",
"configErr should match policy failure, got %v", coord.configErr)

stateChangeTimeout := 2 * time.Second
select {
case state := <-stateChan:
assert.Equal(t, agentclient.Failed, state.State, "Failed policy change should cause Failed coordinator state")
assert.Contains(t, state.Message, cfgChange.err.Error(), "Coordinator state should report failed policy change")

// The component model update happens on a goroutine, thus the new state
// might not have been sent yet. Therefore, a timeout is required.
case <-time.After(time.Second):
assert.Fail(t, "Coordinator's state didn't change")
case <-time.After(stateChangeTimeout):
t.Fatalf("timedout after %s waiting Coordinator's state to change", stateChangeTimeout)
}

// isn't this another test?

// Send an empty vars update. This should regenerate the component model
// based on the last good (empty) policy, producing a "successful" update,
// but the overall reported state should still be Failed because the last
Expand All @@ -420,8 +429,9 @@ agent.download.sourceURI:

// The component model update happens on a goroutine, thus the new state
// might not have been sent yet. Therefore, a timeout is required.
case <-time.After(time.Second):
assert.Fail(t, "Vars change should cause state update")
case <-time.After(stateChangeTimeout):
t.Fatalf("timedout after %s waiting Vars change to cause a state update",
stateChangeTimeout)
}

// Finally, send an empty (valid) policy update and confirm that it
Expand All @@ -439,8 +449,9 @@ agent.download.sourceURI:

// The component model update happens on a goroutine, thus the new state
// might not have been sent yet. Therefore, a timeout is required.
case <-time.After(time.Second):
assert.Fail(t, "Policy change should cause state update")
case <-time.After(stateChangeTimeout):
t.Fatalf("timedout after %s waiting Policy change to cause a state update",
stateChangeTimeout)
}
}

Expand Down

0 comments on commit 1eb18c5

Please sign in to comment.