Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix state store SetAction panic #5438

Merged
merged 7 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Fix the Elastic Agent crashing when self unenrolling due to too many authentication failures against Fleet Server

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; a word indicating the component this changeset affects.
component: state-store

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
#pr: https://github.com/owner/repo/1234

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/5434
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,10 @@ func (h *Unenroll) Handle(ctx context.Context, a fleetapi.Action, acker acker.Ac
unenrollPolicy := newPolicyChange(ctx, config.New(), a, acker, true)
h.ch <- unenrollPolicy

if h.stateStore != nil {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to have been used just to test the handler without a state store, which is one of the reasons the panic was not caught

// backup action for future start to avoid starting fleet gateway loop
h.stateStore.SetAction(a)
if err := h.stateStore.Save(); err != nil {
h.log.Warnf("Failed to update state store: %v", err)
}
// backup action for future start to avoid starting fleet gateway loop
h.stateStore.SetAction(a)
if err := h.stateStore.Save(); err != nil {
h.log.Warnf("Failed to update state store: %v", err)
}

unenrollCtx, cancel := context.WithTimeout(ctx, unenrollTimeout)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ package handlers

import (
"context"
"path/filepath"
"sync/atomic"
"testing"

"github.com/elastic/elastic-agent-client/v7/pkg/client"
"github.com/elastic/elastic-agent-client/v7/pkg/proto"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
"github.com/elastic/elastic-agent/internal/pkg/agent/storage"
"github.com/elastic/elastic-agent/internal/pkg/agent/storage/store"
"github.com/elastic/elastic-agent/internal/pkg/fleetapi"
"github.com/elastic/elastic-agent/pkg/component"
"github.com/elastic/elastic-agent/pkg/component/runtime"
Expand Down Expand Up @@ -105,7 +108,13 @@ func TestActionUnenrollHandler(t *testing.T) {
}
}()

handler := NewUnenroll(log, coord, ch, nil, nil)
storePath := filepath.Join(t.TempDir(), "state.json")
s, err := storage.NewDiskStore(storePath)
require.NoError(t, err, "failed creating DiskStore")

st, err := store.NewStateStore(log, s)
require.NoError(t, err)
handler := NewUnenroll(log, coord, ch, nil, st)

getTamperProtectionFunc := func(enabled bool) func() bool {
return func() bool {
Expand All @@ -119,9 +128,14 @@ func TestActionUnenrollHandler(t *testing.T) {
wantErr error // Handler error
wantPerformedActions int64
tamperProtectionFn func() bool
autoUnenroll bool
}{
{
name: "no running components",
name: "fleet unenroll with no running components",
},
{
name: "auto unenroll with no running components",
autoUnenroll: true,
},
{
name: "endpoint no dispatch",
Expand Down Expand Up @@ -200,10 +214,17 @@ func TestActionUnenrollHandler(t *testing.T) {
handler.tamperProtectionFn = tc.tamperProtectionFn
}

err := handler.Handle(ctx, action, acker)
a := new(fleetapi.ActionUnenroll)
*a = *action
if tc.autoUnenroll {
a.IsDetected = true
}
err := handler.Handle(ctx, a, acker)

require.ErrorIs(t, err, tc.wantErr)
if tc.wantErr == nil {

// autoUnenroll isn't acked
if tc.wantErr == nil && !tc.autoUnenroll {
require.Len(t, acker.Acked, 1)
}
require.Equal(t, tc.wantPerformedActions, coord.performedActions.Load())
Expand Down
13 changes: 13 additions & 0 deletions internal/pkg/agent/storage/store/state_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"encoding/json"
"fmt"
"io"
"reflect"
"sync"

"github.com/elastic/elastic-agent/internal/pkg/agent/storage"
Expand Down Expand Up @@ -169,6 +170,18 @@ func (s *StateStore) SetAction(a fleetapi.Action) {
s.mx.Lock()
defer s.mx.Unlock()

// the reflect.ValueOf(v).IsNil() is required as the type of 'v' on switch
// clause with multiple types will, in this case, preserve the original type.
// See details on https://go.dev/ref/spec#Type_switches
// Without using reflect accessing the concrete type stored in the interface
// isn't possible and as a is of type fleetapi.Action and has a concrete
// value, a is never nil, neither v is nil as it has the same type of a
// on both clauses.
if a == nil || reflect.ValueOf(a).IsNil() {
s.log.Debugf("trying to set an nil '%T' action, ignoring the action", a)
return
}

switch v := a.(type) {
// If any new action type is added, don't forget to update the method's
// description.
Expand Down
90 changes: 88 additions & 2 deletions internal/pkg/agent/storage/store/state_store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
"github.com/elastic/elastic-agent/internal/pkg/agent/storage"
"github.com/elastic/elastic-agent/internal/pkg/agent/vault"
"github.com/elastic/elastic-agent/internal/pkg/fleetapi"
"github.com/elastic/elastic-agent/pkg/core/logger"
"github.com/elastic/elastic-agent/pkg/core/logger/loggertest"
)

type wrongAction struct{}
Expand Down Expand Up @@ -61,7 +61,93 @@ func createAgentVaultAndSecret(t *testing.T, ctx context.Context, tempDir string
}

func runTestStateStore(t *testing.T, ackToken string) {
log, _ := logger.New("state_store", false)
log, _ := loggertest.New("state_store")

t.Run("SetAction corner case", func(t *testing.T) {

t.Run("nil fleetapi.Action", func(t *testing.T) {
var action fleetapi.Action

storePath := filepath.Join(t.TempDir(), "state.json")
s, err := storage.NewDiskStore(storePath)
require.NoError(t, err, "failed creating DiskStore")

store, err := NewStateStore(log, s)
require.NoError(t, err)
require.Nil(t, store.Action())

store.SetAction(action)
store.SetAckToken(ackToken)
err = store.Save()
require.NoError(t, err)

assert.Empty(t, store.Action())
assert.Empty(t, store.Queue())
assert.Equal(t, ackToken, store.AckToken())
})

t.Run("nil concrete and accepted action", func(t *testing.T) {
var actionUnenroll *fleetapi.ActionUnenroll
actionPolicyChange := &fleetapi.ActionPolicyChange{
ActionID: "abc123",
}

storePath := filepath.Join(t.TempDir(), "state.json")
s, err := storage.NewDiskStore(storePath)
require.NoError(t, err, "failed creating DiskStore")

store, err := NewStateStore(log, s)
require.NoError(t, err)
require.Nil(t, store.Action())

// 1st set an action
store.SetAction(actionPolicyChange)
store.SetAckToken(ackToken)
err = store.Save()
require.NoError(t, err)

// then try to set a nil action
store.SetAction(actionUnenroll)
store.SetAckToken(ackToken)
err = store.Save()
require.NoError(t, err)

assert.Equal(t, actionPolicyChange, store.Action())
assert.Empty(t, store.Queue())
assert.Equal(t, ackToken, store.AckToken())
})

t.Run("nil concrete and ignored action", func(t *testing.T) {
var actionUnknown *fleetapi.ActionUnknown
actionPolicyChange := &fleetapi.ActionPolicyChange{
ActionID: "abc123",
}

storePath := filepath.Join(t.TempDir(), "state.json")
s, err := storage.NewDiskStore(storePath)
require.NoError(t, err, "failed creating DiskStore")

store, err := NewStateStore(log, s)
require.NoError(t, err)
require.Nil(t, store.Action())

// 1st set an action
store.SetAction(actionPolicyChange)
store.SetAckToken(ackToken)
err = store.Save()
require.NoError(t, err)

// then try to set a nil action
store.SetAction(actionUnknown)
store.SetAckToken(ackToken)
err = store.Save()
require.NoError(t, err)

assert.Equal(t, actionPolicyChange, store.Action())
assert.Empty(t, store.Queue())
assert.Equal(t, ackToken, store.AckToken())
})
})

t.Run("store is not dirty on successful save", func(t *testing.T) {
storePath := filepath.Join(t.TempDir(), "state.json")
Expand Down