
Commit 9083969

feat: add three new etcd cold boot recovery tests
Add three new test cases to validate etcd cluster recovery from cold boot scenarios reached through different graceful/ungraceful shutdown combinations (GNS = graceful node shutdown, UGNS = ungraceful node shutdown):

- Cold boot from double GNS: both nodes gracefully shut down simultaneously, then both restart (full cluster cold boot)
- Cold boot from sequential GNS: first node gracefully shut down, then second node gracefully shut down, then both restart
- Cold boot from mixed GNS/UGNS: first node gracefully shut down, surviving node then ungracefully shut down, then both restart

Note: The inverse case (UGNS first node, then GNS second) is not tested because in TNF clusters an ungracefully shut down node is quickly recovered, which prevents waiting to gracefully shut down the second node later. The double UGNS scenario is already covered by existing tests.
Parent commit: 7bd72bf


test/extended/two_node/tnf_recovery.go

Lines changed: 187 additions & 20 deletions
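
The three shutdown sequences described in the commit message can be summarized as data. The following is a standalone Go sketch for orientation only; it is not part of the commit, and the scenario/shutdownStep types are illustrative stand-ins for the real tests shown in the diff below.

package main

import "fmt"

// shutdownStep describes how one node is taken down before the cold boot.
type shutdownStep struct {
	node     string
	graceful bool // true = graceful shutdown (GNS), false = ungraceful (UGNS)
}

// scenario lists the shutdown steps of one test; each test then restarts both
// nodes and waits for both etcd members to become healthy voting members.
type scenario struct {
	name  string
	steps []shutdownStep
}

func main() {
	scenarios := []scenario{
		{"cold boot from double GNS (simultaneous)", []shutdownStep{
			{"node A", true}, {"node B", true},
		}},
		{"cold boot from sequential GNS", []shutdownStep{
			{"first node", true}, {"second node", true},
		}},
		{"cold boot from mixed GNS/UGNS", []shutdownStep{
			{"first node", true}, {"surviving node", false},
		}},
	}

	for _, s := range scenarios {
		fmt.Println(s.name)
		for i, st := range s.steps {
			mode := "ungraceful"
			if st.graceful {
				mode = "graceful"
			}
			fmt.Printf("  %d. %s shutdown of %s\n", i+1, mode, st.node)
		}
		fmt.Println("  then: restart both nodes and wait for both etcd members to be healthy")
	}
}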
@@ -32,8 +32,9 @@ const (
 	memberPromotedVotingTimeout     = 15 * time.Minute
 	networkDisruptionDuration       = 15 * time.Second
 	vmRestartTimeout                = 5 * time.Minute
-	vmUngracefulShutdownTimeout     = 30 * time.Second // Ungraceful VM shutdown is typically fast
-	membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM recovering up to Etcd member healthy
+	vmUngracefulShutdownTimeout     = 30 * time.Second // Ungraceful shutdown is typically fast
+	vmGracefulShutdownTimeout       = 10 * time.Minute // Graceful shutdown is typically slow
+	membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM reboot and Etcd member healthy
 	pollInterval                    = 5 * time.Second
 )
 
@@ -189,20 +190,12 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB)
 		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
 
-		dataPair := []struct {
-			vm, node string
-		}{
+		dataPair := []vmNodePair{
 			{vmA, nodeA.Name},
 			{vmB, nodeB.Name},
 		}
 
-		defer func() {
-			for _, d := range dataPair {
-				if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil {
-					fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err)
-				}
-			}
-		}()
+		defer restartVms(dataPair, c)
 
 		g.By("Simulating double node failure: stopping both nodes' VMs")
 		// First, stop all VMs
@@ -217,21 +210,130 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		}
 
 		g.By("Restarting both nodes")
-		// Start all VMs
+		restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot))
+		// Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
+		validateEtcdRecoveryState(oc, etcdClientFactory,
+			&nodeA,
+			&nodeB, true, false,
+			membersHealthyAfterDoubleReboot, pollInterval)
+	})
+
+	g.It("should recover from double graceful node shutdown", func() {
+		// Note: Both nodes are gracefully shut down, then both restart
+		nodeA := peerNode
+		nodeB := targetNode
+		g.GinkgoT().Printf("Testing double node graceful shutdown for %s and %s\n", nodeA.Name, nodeB.Name)
+
+		c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB)
+		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+		dataPair := []vmNodePair{
+			{vmA, nodeA.Name},
+			{vmB, nodeB.Name},
+		}
+
+		defer restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Gracefully shutting down both nodes at the same time (timeout: %v)", vmGracefulShutdownTimeout))
 		for _, d := range dataPair {
-			err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
-			o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to start VM %s (node: %s)", d.vm, d.node))
+			innerErr := services.VirshShutdownVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+			o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected to gracefully shutdown VM %s (node: %s)", d.vm, d.node))
 		}
-		// Wait for all to be running
+
 		for _, d := range dataPair {
-			err := services.WaitForVMState(d.vm, services.VMStateRunning, vmUngracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
-			o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout))
+			innerErr := services.WaitForVMState(d.vm, services.VMStateShutOff, vmGracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+			o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to reach shut off state", d.vm, d.node))
+		}
+
+		g.By("Restarting both nodes")
+		restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot))
+		// Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
+		validateEtcdRecoveryState(oc, etcdClientFactory,
+			&nodeA,
+			&nodeB, true, false,
+			membersHealthyAfterDoubleReboot, pollInterval)
+	})
+
+	g.It("should recover from sequential graceful node shutdowns", func() {
+		// Note: First node is gracefully shut down, then the second, then both restart
+		firstToShutdown := peerNode
+		secondToShutdown := targetNode
+		g.GinkgoT().Printf("Testing sequential graceful shutdowns: first %s, then %s\n",
+			firstToShutdown.Name, secondToShutdown.Name)
+
+		c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown)
+		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+		dataPair := []vmNodePair{
+			{vmFirstToShutdown, firstToShutdown.Name},
+			{vmSecondToShutdown, secondToShutdown.Name},
+		}
+
+		defer restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Gracefully shutting down first node: %s", firstToShutdown.Name))
+
+		err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown))
+
+		g.By(fmt.Sprintf("Gracefully shutting down second node: %s", secondToShutdown.Name))
+		err = vmShutdownAndWait(VMShutdownModeGraceful, vmSecondToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown))
+
+		g.By("Restarting both nodes")
+		restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot))
+		// Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
+		validateEtcdRecoveryState(oc, etcdClientFactory,
+			&firstToShutdown,
+			&secondToShutdown, true, false,
+			membersHealthyAfterDoubleReboot, pollInterval)
+	})
+
+	g.It("should recover from graceful shutdown followed by ungraceful node failure", func() {
+		// Note: First node is gracefully shut down, then the survived node fails ungracefully
+		firstToShutdown := targetNode
+		secondToShutdown := peerNode
+		g.GinkgoT().Printf("Randomly selected %s to shutdown gracefully and %s to survive, then fail ungracefully\n",
+			firstToShutdown.Name, secondToShutdown.Name)
+
+		c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown)
+		o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error")
+
+		dataPair := []vmNodePair{
+			{vmFirstToShutdown, firstToShutdown.Name},
+			{vmSecondToShutdown, secondToShutdown.Name},
 		}
 
+		defer restartVms(dataPair, c)
+
+		g.By(fmt.Sprintf("Gracefully shutting down VM %s (node: %s)", vmFirstToShutdown, firstToShutdown.Name))
+		err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown))
+
+		g.By(fmt.Sprintf("Waiting for %s to recover the etcd cluster standalone (timeout: %v)", secondToShutdown.Name, memberIsLeaderTimeout))
+		validateEtcdRecoveryState(oc, etcdClientFactory,
+			&secondToShutdown,
+			&firstToShutdown, false, true, // expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ungracefully shutting down VM %s (node: %s)", vmSecondToShutdown, secondToShutdown.Name))
+		err = vmShutdownAndWait(VMShutdownModeUngraceful, vmSecondToShutdown, c)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown))
+
+		g.By("Restarting both nodes")
+		restartVms(dataPair, c)
+
 		g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot))
+		// Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
 		validateEtcdRecoveryState(oc, etcdClientFactory,
-			&nodeA, // member on node A considered leader, hence started == true, learner == false
-			&nodeB, true, false, // member on node B expected started == true, learner == false
+			&firstToShutdown,
+			&secondToShutdown, true, false,
 			membersHealthyAfterDoubleReboot, pollInterval)
 	})
 })
@@ -336,6 +438,12 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition
 	return nil
 }
 
+// validateEtcdRecoveryState polls the etcd cluster until the members match the expected state or a timeout is reached.
+//
+// This function assumes that the first node argument is always expected to be a healthy, voting member (isStarted=true, isLearner=false).
+// It validates the state of the second node argument against the provided `isTargetNodeStartedExpected` and `isTargetNodeLearnerExpected` booleans.
+//
+// When both nodes are expected to be healthy voting members, the order of the node arguments is interchangeable.
 func validateEtcdRecoveryState(
 	oc *util.CLI, e *helpers.EtcdClientFactoryImpl,
 	survivedNode, targetNode *corev1.Node,
@@ -541,3 +649,62 @@ func setupMinimalTestEnvironment(oc *util.CLI, nodeA, nodeB *corev1.Node) (c hyp
 
 	return
 }
+
+type vmNodePair struct {
+	vm, node string
+}
+
+type VMShutdownMode int
+
+const (
+	VMShutdownModeGraceful VMShutdownMode = iota + 1
+	VMShutdownModeUngraceful
+)
+
+func (sm VMShutdownMode) String() string {
+	switch sm {
+	case VMShutdownModeGraceful:
+		return "graceful VM shutdown"
+	case VMShutdownModeUngraceful:
+		return "ungraceful VM shutdown"
+	}
+	return "unknown vm shutdown mode"
+}
+
+func vmShutdownAndWait(mode VMShutdownMode, vm string, c hypervisorExtendedConfig) error {
+	var timeout time.Duration
+	var shutdownFunc func(vmName string, sshConfig *core.SSHConfig, knownHostsPath string) error
+	switch mode {
+	case VMShutdownModeGraceful:
+		timeout = vmGracefulShutdownTimeout
+		shutdownFunc = services.VirshShutdownVM
+	case VMShutdownModeUngraceful:
+		timeout = vmUngracefulShutdownTimeout
+		shutdownFunc = services.VirshDestroyVM
+	default:
+		return fmt.Errorf("unexpected VMShutdownMode: %s", mode)
+	}
+
+	g.GinkgoT().Printf("%s: vm %s (timeout: %v)\n", mode, vm, timeout)
+	err := shutdownFunc(vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+	if err != nil {
+		return err
+	}
+
+	return services.WaitForVMState(vm, services.VMStateShutOff, timeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+}
+
+func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
+	// Start all VMs asynchronously
+	for _, d := range dataPair {
+		if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil {
+			fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err)
+		}
+	}
+
+	// Wait for all VMs to be running
+	for _, d := range dataPair {
+		err := services.WaitForVMState(d.vm, services.VMStateRunning, vmRestartTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath)
+		o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout))
+	}
+}
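
For quick reference, the per-phase expectations that the new tests assert through validateEtcdRecoveryState can be tabulated. The sketch below is a standalone summary, not code from the commit; the expectation type and phase labels are illustrative, while the boolean pairs come from the calls shown in the diff above.

package main

import "fmt"

// expectation records the (isStarted, isLearner) pair that a test passes for the
// second (target) node argument of validateEtcdRecoveryState; the first node
// argument is always expected to be a healthy voting member.
type expectation struct {
	phase     string
	isStarted bool
	isLearner bool
}

func main() {
	expectations := []expectation{
		// Mixed GNS/UGNS test, after the first node is gracefully shut down:
		// the surviving node runs etcd standalone while the shut-down member is
		// expected to be a non-started learner (started == false, learner == true).
		{"after graceful shutdown of the first node", false, true},
		// All three tests, after both nodes restart from the cold boot: both
		// members must be healthy voting members (started == true, learner == false).
		{"after cold boot of both nodes", true, false},
	}

	for _, e := range expectations {
		fmt.Printf("%-45s started=%t learner=%t\n", e.phase, e.isStarted, e.isLearner)
	}
}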
