Skip to content

Commit

Permalink
[8.15](backport #5420) [Flaky Test] TestComponentBuildHashInDiagnosti…
Browse files Browse the repository at this point in the history
…cs improve agent state check (#5435)

* [Flaky Test] TestComponentBuildHashInDiagnostics improve agent state check (#5420)

ensure the agent status has components, all components are healthy and the version info is up-to-date

(cherry picked from commit 116e73f)

* manually backport de3dec4

---------

Co-authored-by: Anderson Queiroz <anderson.queiroz@elastic.co>
  • Loading branch information
mergify[bot] and AndersonQ authored Sep 9, 2024
1 parent fc28030 commit 179ffdd
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 14 deletions.
22 changes: 17 additions & 5 deletions pkg/testing/fixture_install.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ func getProcesses(t *gotesting.T, regex string) []runningProcess {
// - an error if any.
func (f *Fixture) installDeb(ctx context.Context, installOpts *InstallOpts, opts []process.CmdOption) ([]byte, error) {
f.t.Logf("[test %s] Inside fixture installDeb function", f.t.Name())
//Prepare so that the f.srcPackage string is populated
// Prepare so that the f.srcPackage string is populated
err := f.EnsurePrepared(ctx)
if err != nil {
return nil, fmt.Errorf("failed to prepare: %w", err)
Expand Down Expand Up @@ -483,7 +483,7 @@ func (f *Fixture) installDeb(ctx context.Context, installOpts *InstallOpts, opts
// - an error if any.
func (f *Fixture) installRpm(ctx context.Context, installOpts *InstallOpts, opts []process.CmdOption) ([]byte, error) {
f.t.Logf("[test %s] Inside fixture installRpm function", f.t.Name())
//Prepare so that the f.srcPackage string is populated
// Prepare so that the f.srcPackage string is populated
err := f.EnsurePrepared(ctx)
if err != nil {
return nil, fmt.Errorf("failed to prepare: %w", err)
Expand Down Expand Up @@ -649,12 +649,12 @@ func (f *Fixture) collectDiagnostics() {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()

dir, err := findProjectRoot(f.caller)
diagPath, err := f.DiagDir()
if err != nil {
f.t.Logf("failed to collect diagnostics; failed to find project root: %s", err)
f.t.Logf("failed to collect diagnostics: %v", err)
return
}
diagPath := filepath.Join(dir, "build", "diagnostics")

err = os.MkdirAll(diagPath, 0755)
if err != nil {
f.t.Logf("failed to collect diagnostics; failed to create %s: %s", diagPath, err)
Expand Down Expand Up @@ -699,6 +699,18 @@ func (f *Fixture) collectDiagnostics() {
}
}

// DiagDir returns the {projectRoot}/build/diagnostics path, where the project
// root is resolved from the fixture's caller. Files on this path are saved if
// any test fails, so use it to keep files for further investigation.
//
// Only the path is computed here; the directory itself is not created.
// An error is returned when the project root cannot be found.
func (f *Fixture) DiagDir() (string, error) {
	root, err := findProjectRoot(f.caller)
	if err != nil {
		return "", fmt.Errorf("failed to find project root: %w", err)
	}

	return filepath.Join(root, "build", "diagnostics"), nil
}

func (f *Fixture) archiveInstallDirectory(installPath string, outputPath string) error {
file, err := os.Create(outputPath)
if err != nil {
Expand Down
94 changes: 85 additions & 9 deletions testing/integration/package_version_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
Expand Down Expand Up @@ -91,27 +92,60 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
"failed to install start agent [output: %s]", string(output))

stateBuff := bytes.Buffer{}
var status atesting.AgentStatusOutput
allHealthy := func() bool {
stateBuff.Reset()

status, err := f.ExecStatus(ctx)
status, err = f.ExecStatus(ctx)
if err != nil {
stateBuff.WriteString(fmt.Sprintf("failed to get agent status: %v",
err))
return false
}

if client.State(status.State) != client.Healthy {
stateBuff.WriteString(fmt.Sprintf(
"agent isn't healthy: %s-%s",
client.State(status.State), status.Message))
return false
}

if len(status.Components) == 0 {
stateBuff.WriteString(fmt.Sprintf(
"healthy but without components: agent status: %s-%s",
client.State(status.State), status.Message))
return false
}

// the agent might be healthy but still waiting for its first configuration;
// in that case, there would be no components yet. Therefore, ensure
// the agent received the policy with components before proceeding with
// the test.
for _, c := range status.Components {
bs, err := json.MarshalIndent(status, "", " ")
if err != nil {
stateBuff.WriteString(fmt.Sprintf(
"%s not healthy, could not marshal status outptu: %v",
c.Name, err))
return false
}

state := client.State(c.State)
if state != client.Healthy {
bs, err := json.MarshalIndent(status, "", " ")
if err != nil {
stateBuff.WriteString(fmt.Sprintf("%s not health, could not marshal status outptu: %v",
c.Name, err))
return false
}

stateBuff.WriteString(fmt.Sprintf("%s not health, agent status output: %s",
stateBuff.WriteString(fmt.Sprintf(
"%s not health, agent status output: %s",
c.Name, bs))
return false
}

// there is a rare race condition, unlikely to happen in a
// production scenario, where the component is healthy but the
// version info is delayed in updating. As the Status command and the
// diagnostics fetch this information in the same way, this check guarantees
// the version info is up-to-date before proceeding with the test.
if c.VersionInfo.Meta.Commit == "" {
stateBuff.WriteString(fmt.Sprintf(
"%s health, but no versionInfo. agent status output: %s",
c.Name, bs))
return false
}
Expand All @@ -123,6 +157,13 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
allHealthy,
5*time.Minute, 10*time.Second,
"agent never became healthy. Last status: %v", &stateBuff)
defer func() {
if !t.Failed() {
return
}

t.Logf("test failed: last status output: %#v", status)
}()

agentbeat := "agentbeat"
if runtime.GOOS == "windows" {
Expand Down Expand Up @@ -159,6 +200,28 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {

diag := t.TempDir()
extractZipArchive(t, diagZip, diag)
// if the test fails, the diagnostics used are useful for debugging.
defer func() {
if !t.Failed() {
return
}

t.Logf("the test failed: trying to save the diagnostics used on the test")
diagDir, err := f.DiagDir()
if err != nil {
t.Logf("could not get diagnostics directory to save the diagnostics used on the test")
return
}

err = os.Rename(diagZip, filepath.Join(diagDir,
fmt.Sprintf("TestComponentBuildHashInDiagnostics-used-diag-%d.zip",
time.Now().Unix())))
if err != nil {
t.Logf("could not move diagnostics used in the test to %s: %v",
diagDir, err)
return
}
}()

stateFilePath := filepath.Join(diag, "state.yaml")
stateYAML, err := os.Open(stateFilePath)
Expand Down Expand Up @@ -192,6 +255,19 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
assert.Equalf(t, wantBuildHash, c.State.VersionInfo.Meta.Commit,
"component %s: VersionInfo.Meta.Commit mismatch", c.ID)
}

if t.Failed() {
_, seek := stateYAML.Seek(0, 0)
if seek != nil {
t.Logf("could not reset state.yaml offset to print it")
return
}
data, err := io.ReadAll(stateYAML)
if err != nil {
t.Logf("could not read state.yaml: %v", err)
}
t.Logf("test failed: state.yaml contents: %q", string(data))
}
}

func testVersionWithRunningAgent(runCtx context.Context, f *atesting.Fixture) func(*testing.T) {
Expand Down

0 comments on commit 179ffdd

Please sign in to comment.