Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
bfab5a3
Add rollback field to UpgradeRequest
pchila Jun 30, 2025
09fb69f
introduce rollback parameter to upgrade
pchila Jun 30, 2025
217b27b
manual rollback from CLI PoC
pchila Jul 1, 2025
9fd76a3
Concurrently retry taking over watcher
pchila Jul 1, 2025
c993ea3
Gracefully shutdown agent watcher
pchila Jul 7, 2025
c815e36
move desired outcome check before grace period evaluation
pchila Jul 9, 2025
0cbcd3e
Add rollbacks available to upgrade marker
pchila Jul 11, 2025
6c1512c
remove fakeAcker in favour of generated Acker mock
pchila Jul 14, 2025
a89ff38
Introduce WatcherHelper
pchila Jul 15, 2025
b0301ee
Add tests for available_rollbacks
pchila Jul 16, 2025
6260e19
Add tests for takeOverWatcher
pchila Jul 16, 2025
954b7d4
add testlocker binary to sonar exclusions
pchila Jul 21, 2025
9b210ba
disable rollback window by default
pchila Jul 25, 2025
fa2385c
Add formal checks to manual rollback arguments
pchila Jul 28, 2025
dd9de00
rename forceRollbackToPreviousVersion
pchila Jul 28, 2025
53fc518
test watchloop
pchila Jul 28, 2025
062ee82
Re-invoke watcher after takeover
pchila Aug 1, 2025
9b4634d
Add minimum version check for creating rollbacks entries in update ma…
pchila Aug 1, 2025
05dd366
Add manual rollback integration test
pchila Aug 2, 2025
80db726
Create watcher subprocess with a new Console on windows
pchila Aug 6, 2025
97a2ad2
Gracefully terminate watcher process on windows
pchila Aug 6, 2025
5e83d5e
Add watcher takedown tests
pchila Aug 7, 2025
40a7004
Add in-process watcher grappler
pchila Aug 11, 2025
33af908
WIP use in-process grappler
pchila Aug 11, 2025
60f7aef
remove in-process grappler in favor of commandGrappler
pchila Aug 11, 2025
59c5138
Allow watcher to listen to signals only during watch loop
pchila Aug 11, 2025
10b870f
Add postWatchHook to watcher process start to keep race detector happy
pchila Aug 12, 2025
57f3752
fix lint errors
pchila Aug 12, 2025
432dd4d
Fix data races in unit tests
pchila Aug 12, 2025
be41726
make watcher rollback only if the agent has not been already rolled back
pchila Aug 15, 2025
275c209
fix lint
pchila Aug 15, 2025
9cc948d
Add a pre-restart hook to Rollback operation
pchila Aug 15, 2025
1fcaa46
Update upgrade details metadata Equals() with new fields
pchila Aug 27, 2025
06a39b1
Remove parent death signal for watcher on linux
pchila Aug 28, 2025
9736c18
Refactor: move TakedownWatcher function to watch subcommand
pchila Aug 28, 2025
9736a35
Move RollbacksAvailable struct out of upgrade details
pchila Aug 29, 2025
a64b5ae
WIP - Distinguish between upgrade and rollback operations in upgrade …
pchila Sep 8, 2025
cefb6bf
WIP - remove DESIRED_OUTCOME in favor of watch --rollback
pchila Sep 10, 2025
d101259
fix lint errors
pchila Sep 11, 2025
bae09e3
fixup! fix lint errors
pchila Sep 11, 2025
ec75be9
Add tests for RollbackWithOpts
pchila Sep 11, 2025
f0afe86
Fix binary path in TestRollbackWithOpts on MacOS
pchila Sep 12, 2025
57a6197
fixup! Fix binary path in TestRollbackWithOpts on MacOS
pchila Sep 12, 2025
abb1dce
address code review feedback
pchila Sep 12, 2025
3a525d1
Fix printf-style assertion message
pchila Sep 12, 2025
3a994b4
fixup! fixup! Fix binary path in TestRollbackWithOpts on MacOS
pchila Sep 12, 2025
32e05c1
rename *_notwindows.go to *_other.go
pchila Sep 12, 2025
e2891d8
Add version agent rollbacks to in manual rollback reason
pchila Sep 15, 2025
4531c8e
define a constant for rollback window value that disables manual roll…
pchila Sep 15, 2025
2a5b008
fixup! Add version agent rollbacks to in manual rollback reason
pchila Sep 15, 2025
b8de122
Fix log levels for dumping processes IDs attached to the same windows…
pchila Sep 15, 2025
80f65af
add more descriptive constants for AgentWatcherHelper.TakeOverWatcher
pchila Sep 15, 2025
1a376f1
avoid shadowing rollback function with variables in upgrade subcommand
pchila Sep 15, 2025
f05a783
Check upgrade details state before allowing a manual rollback
pchila Sep 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions .mockery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,24 @@ packages:
interfaces:
Agent:
github.com/elastic/elastic-agent/internal/pkg/agent/cmd:
config:
inpackage: True
with-expecter: True
dir: "{{.InterfaceDirRelative}}"
mockname: "{{.Mock}}{{.InterfaceName | firstUpper}}"
outpkg: "{{.PackageName}}"
filename: "{{.Mock | lower}}_{{.InterfaceName | lower}}_test.go"
interfaces:
agentWatcher:
config:
mockname: "AgentWatcher"
installationModifier:
config:
mockname: "InstallationModifier"
github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade:
config:
inpackage: True
with-expecter: True
dir: "{{.InterfaceDirRelative}}"
mockname: "{{.Mock}}{{.InterfaceName | firstUpper}}"
outpkg: "{{.PackageName}}"
filename: "{{.Mock | lower}}_{{.InterfaceName | lower}}_test.go"
interfaces:
WatcherHelper:
watcherGrappler:
2 changes: 1 addition & 1 deletion _meta/config/common.reference.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ inputs:
# # rollback settings
# rollback:
# # duration in which an upgraded Agent may be manually rolled back.
# window: 168h
# window: 0

# agent.process:
# # timeout for creating new processes. when process is not successfully created by this timeout
Expand Down
3 changes: 3 additions & 0 deletions control_v2.proto
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ message UpgradeRequest {
//
// If provided Elastic Agent package embedded PGP key is not checked for signature during upgrade.
bool skipDefaultPgp = 5;

// If true, it indicates that we wish to roll back the current/last upgrade
bool rollback = 6;
}

// An upgrade response message.
Expand Down
2 changes: 1 addition & 1 deletion elastic-agent.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ inputs:
# # rollback settings
# rollback:
# # duration in which an upgraded Agent may be manually rolled back.
# window: 168h
# window: 0

# agent.process:
# # timeout for creating new processes. when process is not successfully created by this timeout
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,7 @@ func (u *mockUpgradeManager) Reload(rawConfig *config.Config) error {
return nil
}

func (u *mockUpgradeManager) Upgrade(
ctx context.Context,
version string,
sourceURI string,
action *fleetapi.ActionUpgrade,
details *details.Details,
skipVerifyOverride bool,
skipDefaultPgp bool,
pgpBytes ...string) (reexec.ShutdownCallbackFn, error) {
func (u *mockUpgradeManager) Upgrade(ctx context.Context, version string, rollback bool, sourceURI string, action *fleetapi.ActionUpgrade, details *details.Details, skipVerifyOverride bool, skipDefaultPgp bool, pgpBytes ...string) (reexec.ShutdownCallbackFn, error) {

return u.UpgradeFn(
ctx,
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func New(

// monitoring is not supported in bootstrap mode https://github.com/elastic/elastic-agent/issues/1761
isMonitoringSupported := !disableMonitoring && cfg.Settings.V1MonitoringEnabled
upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, agentInfo)
upgrader, err := upgrade.NewUpgrader(log, cfg.Settings.DownloadConfig, cfg.Settings.Upgrade, agentInfo, new(upgrade.AgentWatcherHelper))
if err != nil {
return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err)
}
Expand Down
14 changes: 11 additions & 3 deletions internal/pkg/agent/application/coordinator/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ type UpgradeManager interface {
Reload(rawConfig *config.Config) error

// Upgrade upgrades running agent.
Upgrade(ctx context.Context, version string, sourceURI string, action *fleetapi.ActionUpgrade, details *details.Details, skipVerifyOverride bool, skipDefaultPgp bool, pgpBytes ...string) (_ reexec.ShutdownCallbackFn, err error)
Upgrade(ctx context.Context, version string, rollback bool, sourceURI string, action *fleetapi.ActionUpgrade, details *details.Details, skipVerifyOverride bool, skipDefaultPgp bool, pgpBytes ...string) (_ reexec.ShutdownCallbackFn, err error)

// Ack is used on startup to check if the agent has upgraded and needs to send an ack for the action
Ack(ctx context.Context, acker acker.Acker) error
Expand Down Expand Up @@ -700,6 +700,7 @@ type upgradeOpts struct {
skipDefaultPgp bool
pgpBytes []string
preUpgradeCallback func(ctx context.Context, log *logger.Logger, action *fleetapi.ActionUpgrade) error
rollback bool
}

type UpgradeOpt func(*upgradeOpts)
Expand Down Expand Up @@ -728,6 +729,12 @@ func WithPreUpgradeCallback(preUpgradeCallback func(ctx context.Context, log *lo
}
}

// WithRollback returns an UpgradeOpt that stores the given rollback value
// in the upgrade options it is applied to.
func WithRollback(rollback bool) UpgradeOpt {
	setRollback := func(o *upgradeOpts) {
		o.rollback = rollback
	}
	return setRollback
}

// Upgrade runs the upgrade process.
// Called from external goroutines.
func (c *Coordinator) Upgrade(ctx context.Context, version string, sourceURI string, action *fleetapi.ActionUpgrade, opts ...UpgradeOpt) error {
Expand All @@ -741,7 +748,8 @@ func (c *Coordinator) Upgrade(ctx context.Context, version string, sourceURI str
var err error
for i := 0; i < 5; i++ {
s := c.State()
if s.State != agentclient.Upgrading {
// if we are not already upgrading or if the incoming is a rollback request while the watcher is running, we can continue processing
if s.State != agentclient.Upgrading || (uOpts.rollback && s.UpgradeDetails != nil && s.UpgradeDetails.State == details.StateWatching) {
err = nil
break
}
Expand Down Expand Up @@ -785,7 +793,7 @@ func (c *Coordinator) Upgrade(ctx context.Context, version string, sourceURI str
}
}

cb, err := c.upgradeMgr.Upgrade(ctx, version, sourceURI, action, det, uOpts.skipVerifyOverride, uOpts.skipDefaultPgp, uOpts.pgpBytes...)
cb, err := c.upgradeMgr.Upgrade(ctx, version, uOpts.rollback, sourceURI, action, det, uOpts.skipVerifyOverride, uOpts.skipDefaultPgp, uOpts.pgpBytes...)
if err != nil {
c.ClearOverrideState()
if errors.Is(err, upgrade.ErrUpgradeSameVersion) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1204,7 +1204,7 @@ func (f *fakeUpgradeManager) Reload(cfg *config.Config) error {
return nil
}

func (f *fakeUpgradeManager) Upgrade(ctx context.Context, version string, sourceURI string, action *fleetapi.ActionUpgrade, details *details.Details, skipVerifyOverride bool, skipDefaultPgp bool, pgpBytes ...string) (_ reexec.ShutdownCallbackFn, err error) {
func (f *fakeUpgradeManager) Upgrade(ctx context.Context, version string, rollback bool, sourceURI string, action *fleetapi.ActionUpgrade, details *details.Details, skipVerifyOverride bool, skipDefaultPgp bool, pgpBytes ...string) (_ reexec.ShutdownCallbackFn, err error) {
f.upgradeCalled = true
if f.upgradeErr != nil {
return nil, f.upgradeErr
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -462,11 +462,7 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) {
}
}()

upgradeMgr, err := upgrade.NewUpgrader(
log,
&artifact.Config{},
&info.AgentInfo{},
)
upgradeMgr, err := upgrade.NewUpgrader(log, &artifact.Config{}, nil, &info.AgentInfo{}, new(upgrade.AgentWatcherHelper))
require.NoError(t, err, "errored when creating a new upgrader")

// Channels have buffer length 1, so we don't have to run on multiple
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/agent/application/filelock/locker.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func (a *AppLocker) TryLock() error {
if !locked {
return ErrAppAlreadyRunning
}

return nil
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore test binary
testlocker
64 changes: 64 additions & 0 deletions internal/pkg/agent/application/filelock/testlocker/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License 2.0;
// you may not use this file except in compliance with the Elastic License 2.0.

// This is a simple program that locks an AppLocker on the file passed via the -lockfile option; it is used to test that file locking works properly.
// Receiving os.Interrupt or syscall.SIGTERM makes the program release the lock and exit (unless -ignoresignals is set).
package main

import (
"flag"
"log"
"os"
"os/signal"
"path/filepath"
"syscall"

"github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock"
)

const (
	// AcquiredLockLogFmt is the log format used to report that the lock on a
	// file was acquired; the %s verb receives the lock file path.
	AcquiredLockLogFmt = "Acquired lock on file %s\n"

	// Names of the command-line flags registered below.
	lockFileFlagName     = "lockfile"
	ignoreSignalFlagName = "ignoresignals"
)

var (
	// lockFile holds the path of the file to lock (empty by default).
	lockFile = flag.String(lockFileFlagName, "", "path to lock file")

	// ignoreSignals, when set, makes the program log received signals
	// instead of exiting on them.
	ignoreSignals = flag.Bool(ignoreSignalFlagName, false, "ignore signals")
)

// main locks the file given by -lockfile through a filelock.AppLocker and
// then waits for signals. Unless -ignoresignals is set, the first received
// signal makes it return, which releases the lock via the deferred Unlock.
// A missing -lockfile argument or a failed TryLock terminates the program
// through log.Fatalf.
func main() {
	// Signal subscription is set up before the flags are parsed.
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)

	flag.Parse()
	if *lockFile == "" {
		log.Fatalf("No lockfile specified. Please run %s -%s <path to lockfile>", os.Args[0], lockFileFlagName)
	}

	// The locker takes the directory and the file name separately.
	dir, name := filepath.Dir(*lockFile), filepath.Base(*lockFile)
	locker := filelock.NewAppLocker(dir, name)
	if err := locker.TryLock(); err != nil {
		log.Fatalf("Error locking %s: %s", *lockFile, err.Error())
	}

	// Release the lock on the way out; a failure is only logged.
	defer func() {
		if err := locker.Unlock(); err != nil {
			log.Printf("Error unlocking %s: %s", *lockFile, err.Error())
		}
	}()

	log.Printf(AcquiredLockLogFmt, *lockFile)

	// The channel is never closed, so this loop only ends on the return below.
	for sig := range sigCh {
		if !*ignoreSignals {
			log.Printf("Received signal %v , exiting...", sig)
			return
		}
		log.Printf("Received signal %v , ignoring it...", sig)
	}
}
3 changes: 2 additions & 1 deletion internal/pkg/agent/application/upgrade/details/details.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ func (m Metadata) Equals(otherM Metadata) bool {
m.DownloadPercent == otherM.DownloadPercent &&
m.DownloadRate == otherM.DownloadRate &&
equalTimePointers(m.RetryUntil, otherM.RetryUntil) &&
m.RetryErrorMsg == otherM.RetryErrorMsg
m.RetryErrorMsg == otherM.RetryErrorMsg &&
m.Reason == otherM.Reason
}

func equalTimePointers(t, otherT *time.Time) bool {
Expand Down
3 changes: 2 additions & 1 deletion internal/pkg/agent/application/upgrade/details/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ const (
StateFailed State = "UPG_FAILED"

// List of well-known reasons for state transitions
ReasonWatchFailed = "watch failed"
ReasonWatchFailed = "watch failed"
ReasonManualRollbackPattern = "manual rollback requested to version %s"
)
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ details:
expectedDetails: &details.Details{
TargetVersion: "8.9.2",
State: details.StateRollback,
Metadata: details.Metadata{
Reason: details.ReasonWatchFailed,
},
},
},
"same_version_with_details_some_state": {
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading