Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .mockery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,11 @@ packages:
github.com/elastic/elastic-agent/internal/pkg/agent/application/info:
interfaces:
Agent:
github.com/elastic/elastic-agent/internal/pkg/agent/cmd:
interfaces:
agentWatcher:
config:
mockname: "AgentWatcher"
installationModifier:
config:
mockname: "InstallationModifier"
4 changes: 2 additions & 2 deletions NOTICE-fips.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1254,11 +1254,11 @@ SOFTWARE

--------------------------------------------------------------------------------
Dependency : github.com/elastic/elastic-agent-libs
Version: v0.20.1
Version: v0.21.0
Licence type (autodetected): Apache-2.0
--------------------------------------------------------------------------------

Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.20.1/LICENSE:
Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.21.0/LICENSE:

Apache License
Version 2.0, January 2004
Expand Down
4 changes: 2 additions & 2 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1254,11 +1254,11 @@ SOFTWARE

--------------------------------------------------------------------------------
Dependency : github.com/elastic/elastic-agent-libs
Version: v0.20.1
Version: v0.21.0
Licence type (autodetected): Apache-2.0
--------------------------------------------------------------------------------

Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.20.1/LICENSE:
Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.21.0/LICENSE:

Apache License
Version 2.0, January 2004
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: feature

# Change summary; an 80ish-character description of the change.
summary: Preserve upgrade marker when rolling back upgrade and add rollback reason

# Long description; in case the summary is not enough to describe the change
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
description: The upgrade marker is now preserved by the watcher when performing a rollback and a new `reason`
field is added to the upgrade details structure. The reason for keeping the upgrade marker when rolling back is
to allow the rolled-back agent to read the rollback reason and communicate it to the user/Fleet.

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/8407

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
4 changes: 4 additions & 0 deletions control_v2.proto
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ message UpgradeDetailsMetadata {
// The deadline until when a retryable upgrade step, e.g. the download
// step, will be retried.
string retry_until = 6;

// Reason is a string that may give out more information about transitioning to the current state.
// It has been introduced initially to distinguish between manual and automatic rollbacks
string reason = 7;
}

// DiagnosticFileResult is a file result from a diagnostic result.
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ require (
github.com/elastic/cloud-on-k8s/v2 v2.0.0-20250327073047-b624240832ae
github.com/elastic/elastic-agent-autodiscover v0.9.2
github.com/elastic/elastic-agent-client/v7 v7.17.2
github.com/elastic/elastic-agent-libs v0.20.1
github.com/elastic/elastic-agent-libs v0.21.0
github.com/elastic/elastic-agent-system-metrics v0.11.16
github.com/elastic/elastic-transport-go/v8 v8.7.0
github.com/elastic/go-elasticsearch/v8 v8.18.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -522,8 +522,8 @@ github.com/elastic/elastic-agent-autodiscover v0.9.2 h1:eBmru2v66HRRHOFf89rDl9OZ
github.com/elastic/elastic-agent-autodiscover v0.9.2/go.mod h1:RNaHnOTYfNptSTQUyZYnjypxmrR5AaE6BIap/175F5c=
github.com/elastic/elastic-agent-client/v7 v7.17.2 h1:Cl2TeABqWZgW40t5fchGWT/sRk4MDDLWA0d8iHHOxLA=
github.com/elastic/elastic-agent-client/v7 v7.17.2/go.mod h1:5irRFqp6HLqtu1S+OeY0jg8x7K6PLL+DW+PwVk1vJnk=
github.com/elastic/elastic-agent-libs v0.20.1 h1:M7ZME7yctVhI9349OiG0VQ8a+RsuParA/ZUgCuctwBE=
github.com/elastic/elastic-agent-libs v0.20.1/go.mod h1:xSeIP3NtOIT4N2pPS4EyURmS1Q8mK0lWZ8Wd1Du6q3w=
github.com/elastic/elastic-agent-libs v0.21.0 h1:lt2Xc87Si0mea0BgRKZGZA30j8LEx57k7GAyiKmZP/8=
github.com/elastic/elastic-agent-libs v0.21.0/go.mod h1:xSeIP3NtOIT4N2pPS4EyURmS1Q8mK0lWZ8Wd1Du6q3w=
github.com/elastic/elastic-agent-system-metrics v0.11.16 h1:cLjuO8pE5cUwPGWUHmy1VOERmJVDaep8gY+U4YRQ5vs=
github.com/elastic/elastic-agent-system-metrics v0.11.16/go.mod h1:qiZC5p1hd8te4XVnhh7FkXdcYhxFnl5i9GJpROtf6zo=
github.com/elastic/elastic-transport-go/v8 v8.7.0 h1:OgTneVuXP2uip4BA658Xi6Hfw+PeIOod2rY3GVMGoVE=
Expand Down
25 changes: 25 additions & 0 deletions internal/pkg/agent/application/upgrade/details/details.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ type Metadata struct {
// the Fail() method of UpgradeDetails to correctly record details when
// an upgrade fails.
ErrorMsg string `json:"error_msg,omitempty" yaml:"error_msg,omitempty"`

// Reason is a string that may give out more information about transitioning to the current state. It has been
// introduced initially to distinguish between manual and automatic rollbacks
Reason string `json:"reason,omitempty" yaml:"reason,omitempty"`
}

func NewDetails(targetVersion string, initialState State, actionID string) *Details {
Expand Down Expand Up @@ -87,6 +91,27 @@ func (d *Details) SetState(s State) {
d.notifyObservers()
}

// SetStateWithReason updates the upgrade state together with Metadata.Reason
// and then notifies all observers. The whole update happens while holding the
// Details mutex.
//
// Do NOT call SetStateWithReason with StateFailed; call the Fail method instead.
func (d *Details) SetStateWithReason(s State, reason string) {
	d.mu.Lock()
	defer d.mu.Unlock()

	// Metadata.FailedState and Metadata.ErrorMsg are only meaningful when the
	// State is StateFailed (see the Fail method), so wipe them for any other
	// state being set here.
	if s != StateFailed {
		d.Metadata.FailedState = ""
		d.Metadata.ErrorMsg = ""
	}

	d.Metadata.Reason = reason
	d.State = s

	d.notifyObservers()
}

// SetDownloadProgress is a convenience method to set the download percent
// and download rate when the upgrade is in UPG_DOWNLOADING state.
func (d *Details) SetDownloadProgress(percent, rateBytesPerSecond float64) {
Expand Down
10 changes: 10 additions & 0 deletions internal/pkg/agent/application/upgrade/details/details_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand All @@ -30,6 +31,15 @@ func TestDetailsSetState(t *testing.T) {
require.Equal(t, StateDownloading, det.State)
}

// TestDetailsSetStateWithReason checks that SetStateWithReason updates both the
// state and the metadata reason of the upgrade details.
func TestDetailsSetStateWithReason(t *testing.T) {
	const (
		targetVersion = "99.999.9999"
		actionID      = "test_action_id"
	)

	// Start from the watching state and make sure the fixture is what we expect.
	upgradeDetails := NewDetails(targetVersion, StateWatching, actionID)
	require.Equal(t, StateWatching, upgradeDetails.State)

	upgradeDetails.SetStateWithReason(StateRollback, ReasonWatchFailed)

	assert.Equal(t, StateRollback, upgradeDetails.State)
	assert.Equal(t, ReasonWatchFailed, upgradeDetails.Metadata.Reason)
}

func TestDetailsFail(t *testing.T) {
det := NewDetails("99.999.9999", StateRequested, "test_action_id")
require.Equal(t, StateRequested, det.State)
Expand Down
7 changes: 5 additions & 2 deletions internal/pkg/agent/application/upgrade/details/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ package details

type State string

// The values of these State* constants should match those enumerated for
// upgrade_details.state in https://github.com/elastic/fleet-server/blob/main/model/openapi.yml
const (
// The values of these State* constants should match those enumerated for
// upgrade_details.state in https://github.com/elastic/fleet-server/blob/main/model/openapi.yml
StateRequested State = "UPG_REQUESTED"
StateScheduled State = "UPG_SCHEDULED"
StateDownloading State = "UPG_DOWNLOADING"
Expand All @@ -19,4 +19,7 @@ const (
StateRollback State = "UPG_ROLLBACK"
StateCompleted State = "UPG_COMPLETED"
StateFailed State = "UPG_FAILED"

// List of well-known reasons for state transitions
ReasonWatchFailed = "watch failed"
)
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/marker_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ func (mfw *MarkerFileWatcher) processMarker(currentVersion string, commit string
if marker.Details == nil {
marker.Details = details.NewDetails("unknown", details.StateRollback, marker.GetActionID())
} else if marker.Details.State == "" {
marker.Details.SetState(details.StateRollback)
marker.Details.SetStateWithReason(details.StateRollback, details.ReasonWatchFailed)
}
}

Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func Rollback(ctx context.Context, log *logger.Logger, c client.Client, topDirPa
}

// cleanup everything except version we're rolling back into
return Cleanup(log, topDirPath, prevVersionedHome, prevHash, true, true)
return Cleanup(log, topDirPath, prevVersionedHome, prevHash, false, true)
}

// Cleanup removes all artifacts and files related to a specified version.
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/upgrade/rollback_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ func checkFilesAfterRollback(t *testing.T, topDir, oldAgentHome, newAgentHome st
assert.Equal(t, []byte("Placeholder for agent 1.2.3-SNAPSHOT"), elasticAgentBytes, "reading elastic-agent content through symbolic link should point to the old version")
}

assert.NoFileExists(t, filepath.Join(topDir, "data", markerFilename), "update marker should have been cleaned up")
assert.FileExists(t, filepath.Join(topDir, "data", markerFilename), "update marker should survive cleanup in case of rollback")
}

// setupAgents create fake agent installs, update marker file and symlink pointing to one of the installations' elastic-agent placeholder
Expand Down
4 changes: 2 additions & 2 deletions internal/pkg/agent/application/upgrade/step_mark.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ func loadMarker(markerFile string) (*UpdateMarker, error) {
// SaveMarker serializes and persists the given upgrade marker to disk.
// For critical upgrade transitions, pass shouldFsync as true so the marker
// file is immediately flushed to persistent storage.
func SaveMarker(marker *UpdateMarker, shouldFsync bool) error {
return saveMarkerToPath(marker, markerFilePath(paths.Data()), shouldFsync)
func SaveMarker(dataDirPath string, marker *UpdateMarker, shouldFsync bool) error {
return saveMarkerToPath(marker, markerFilePath(dataDirPath), shouldFsync)
}

func saveMarkerToPath(marker *UpdateMarker, markerFile string, shouldFsync bool) error {
Expand Down
4 changes: 2 additions & 2 deletions internal/pkg/agent/application/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ func waitForWatcherWithTimeoutCreationFunc(ctx context.Context, log *logger.Logg
}

case <-watcherContext.Done():
log.Error("upgrade watcher did not start watching within %s or context has expired", waitTime)
log.Errorf("upgrade watcher did not start watching within %s or context has expired", waitTime)
return goerrors.Join(ErrWatcherNotStarted, watcherContext.Err())
}
}
Expand Down Expand Up @@ -449,7 +449,7 @@ func (u *Upgrader) Ack(ctx context.Context, acker acker.Acker) error {

marker.Acked = true

return SaveMarker(marker, false)
return SaveMarker(paths.Data(), marker, false)
}

func (u *Upgrader) AckAction(ctx context.Context, acker acker.Acker, action fleetapi.Action) error {
Expand Down
3 changes: 3 additions & 0 deletions internal/pkg/agent/cmd/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,9 @@ func listUpgradeDetails(l list.Writer, upgradeDetails *cproto.UpgradeDetails) {
if upgradeDetails.Metadata.RetryErrorMsg != "" {
l.AppendItem("retry_error_msg: " + upgradeDetails.Metadata.RetryErrorMsg)
}
if upgradeDetails.Metadata.Reason != "" {
l.AppendItem("reason: " + upgradeDetails.Metadata.Reason)
}
l.UnIndent()
}

Expand Down
68 changes: 51 additions & 17 deletions internal/pkg/agent/cmd/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command
// Make sure to flush any buffered logs before we're done.
defer log.Sync() //nolint:errcheck // flushing buffered logs is best effort.

if err := watchCmd(log, cfg); err != nil {
if err := watchCmd(log, paths.Top(), cfg.Settings.Upgrade.Watcher, new(upgradeAgentWatcher), new(upgradeInstallationModifier)); err != nil {
log.Errorw("Watch command failed", "error.message", err)
fmt.Fprintf(streams.Err, "Watch command failed: %v\n%s\n", err, troubleshootMessage())
os.Exit(4)
Expand All @@ -64,21 +64,30 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command
return cmd
}

func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
log.Infow("Upgrade Watcher started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion())
marker, err := upgrade.LoadMarker(paths.Data())
// agentWatcher abstracts the watch phase that watchCmd runs after an upgrade,
// so that it can be injected (and mocked) by the caller.
// watchCmd treats a non-nil error returned by Watch as the trigger for a rollback.
type agentWatcher interface {
Watch(ctx context.Context, tilGrace, errorCheckInterval time.Duration, log *logp.Logger) error
}

// installationModifier abstracts the operations watchCmd uses to change what is
// installed on disk, so that they can be injected (and mocked) by the caller.
type installationModifier interface {
// Cleanup is invoked by watchCmd both after a successful watch and when the
// watcher starts outside the grace period or with a terminal upgrade state.
Cleanup(log *logger.Logger, topDirPath, currentVersionedHome, currentHash string, removeMarker, keepLogs bool) error
// Rollback is invoked by watchCmd when the watch reports an error, passing the
// previous versioned home and hash recorded in the upgrade marker.
Rollback(ctx context.Context, log *logger.Logger, c client.Client, topDirPath, prevVersionedHome, prevHash string) error
}

func watchCmd(log *logp.Logger, topDir string, cfg *configuration.UpgradeWatcherConfig, watcher agentWatcher, installModifier installationModifier) error {
log.Infow("Upgrade Watcher started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion(), "config", cfg)
dataDir := paths.DataFrom(topDir)
marker, err := upgrade.LoadMarker(dataDir)
if err != nil {
log.Error("failed to load marker", err)
return err
}
if marker == nil {
// no marker found we're not in upgrade process
log.Infof("update marker not present at '%s'", paths.Data())
log.Infof("update marker not present at '%s'", dataDir)
return nil
}

log.Infof("Loaded update marker %+v", marker)

log.With("marker", marker, "details", marker.Details).Info("Loaded update marker")
locker := filelock.NewAppLocker(paths.Top(), watcherLockFile)
if err := locker.TryLock(); err != nil {
if errors.Is(err, filelock.ErrAppAlreadyRunning) {
Expand All @@ -93,14 +102,18 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
_ = locker.Unlock()
}()

isWithinGrace, tilGrace := gracePeriod(marker, cfg.Settings.Upgrade.Watcher.GracePeriod)
if !isWithinGrace {
log.Infof("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String())
isWithinGrace, tilGrace := gracePeriod(marker, cfg.GracePeriod)
if isTerminalState(marker) || !isWithinGrace {
stateString := ""
if marker.Details != nil {
stateString = string(marker.Details.State)
}
log.Infof("not within grace [updatedOn %v] %v or agent have been rolled back [state: %s]", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String(), stateString)
// if it is started outside of upgrade loop
// if we're not within grace and marker is still there it might mean
// that cleanup was not performed ok, cleanup everything except current version
// hash is the same as hash of agent which initiated watcher.
if err := upgrade.Cleanup(log, paths.Top(), paths.VersionedHome(paths.Top()), release.ShortCommit(), true, false); err != nil {
if err := installModifier.Cleanup(log, paths.Top(), paths.VersionedHome(topDir), release.ShortCommit(), true, false); err != nil {
log.Error("clean up of prior watcher run failed", err)
}
// exit nicely
Expand All @@ -109,15 +122,18 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {

// About to start watching the upgrade. Initialize upgrade details and save them in the
// upgrade marker.
upgradeDetails := initUpgradeDetails(marker, upgrade.SaveMarker, log)
saveMarkerFunc := func(marker *upgrade.UpdateMarker, b bool) error {
return upgrade.SaveMarker(dataDir, marker, b)
}
upgradeDetails := initUpgradeDetails(marker, saveMarkerFunc, log)

errorCheckInterval := cfg.Settings.Upgrade.Watcher.ErrorCheck.Interval
errorCheckInterval := cfg.ErrorCheck.Interval
ctx := context.Background()
if err := watch(ctx, tilGrace, errorCheckInterval, log); err != nil {
if err := watcher.Watch(ctx, tilGrace, errorCheckInterval, log); err != nil {
log.Error("Error detected, proceeding to rollback: %v", err)

upgradeDetails.SetState(details.StateRollback)
err = upgrade.Rollback(ctx, log, client.New(), paths.Top(), marker.PrevVersionedHome, marker.PrevHash)
upgradeDetails.SetStateWithReason(details.StateRollback, details.ReasonWatchFailed)
err = installModifier.Rollback(ctx, log, client.New(), paths.Top(), marker.PrevVersionedHome, marker.PrevHash)
if err != nil {
log.Error("rollback failed", err)
upgradeDetails.Fail(err)
Expand All @@ -135,13 +151,31 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error {
// Why is this being skipped on Windows? The comment above is not clear.
// issue: https://github.com/elastic/elastic-agent/issues/3027
removeMarker := !isWindows()
err = upgrade.Cleanup(log, paths.Top(), marker.VersionedHome, marker.Hash, removeMarker, false)
err = installModifier.Cleanup(log, topDir, marker.VersionedHome, marker.Hash, removeMarker, false)
if err != nil {
log.Error("cleanup after successful watch failed", err)
}
return err
}

// isTerminalState reports whether the upgrade marker carries upgrade details
// whose state is a terminal one: UPG_COMPLETED, UPG_ROLLBACK or UPG_FAILED.
//
// It returns false when the upgrade marker or its details are nil: with no
// state specified, a marker without details would mean that some upgrade
// operation is still ongoing (probably initiated by an older agent).
func isTerminalState(marker *upgrade.UpdateMarker) bool {
	// Check the marker itself as well as its details, so that the documented
	// "nil marker returns false" contract holds instead of panicking on a nil
	// pointer dereference.
	if marker == nil || marker.Details == nil {
		return false
	}

	switch marker.Details.State {
	case details.StateCompleted, details.StateRollback, details.StateFailed:
		return true
	default:
		return false
	}
}

// isWindows reports whether the running binary was built for Windows,
// based on runtime.GOOS.
func isWindows() bool {
	const windowsOS = "windows"
	return runtime.GOOS == windowsOS
}
Expand Down
Loading