@@ -32,8 +32,9 @@ const (
3232 memberPromotedVotingTimeout = 15 * time .Minute
3333 networkDisruptionDuration = 15 * time .Second
3434 vmRestartTimeout = 5 * time .Minute
35- vmUngracefulShutdownTimeout = 30 * time .Second // Ungraceful VM shutdown is typically fast
36- membersHealthyAfterDoubleReboot = 15 * time .Minute // It takes into account full VM recovering up to Etcd member healthy
35+ vmUngracefulShutdownTimeout = 30 * time .Second // Ungraceful shutdown is typically fast
36+ vmGracefulShutdownTimeout = 10 * time .Minute // Graceful shutdown is typically slow
37+ membersHealthyAfterDoubleReboot = 15 * time .Minute // It takes into account full VM reboot and Etcd member healthy
3738 pollInterval = 5 * time .Second
3839)
3940
@@ -189,20 +190,12 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
189190 c , vmA , vmB , err := setupMinimalTestEnvironment (oc , & nodeA , & nodeB )
190191 o .Expect (err ).To (o .BeNil (), "Expected to setup test environment without error" )
191192
192- dataPair := []struct {
193- vm , node string
194- }{
193+ dataPair := []vmNodePair {
195194 {vmA , nodeA .Name },
196195 {vmB , nodeB .Name },
197196 }
198197
199- defer func () {
200- for _ , d := range dataPair {
201- if err := services .VirshStartVM (d .vm , & c .HypervisorConfig , c .HypervisorKnownHostsPath ); err != nil {
202- fmt .Fprintf (g .GinkgoWriter , "Warning: failed to restart VM %s during cleanup: %v\n " , d .vm , err )
203- }
204- }
205- }()
198+ defer restartVms (dataPair , c )
206199
207200 g .By ("Simulating double node failure: stopping both nodes' VMs" )
208201 // First, stop all VMs
@@ -217,21 +210,130 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
217210 }
218211
219212 g .By ("Restarting both nodes" )
220- // Start all VMs
213+ restartVms (dataPair , c )
214+
215+ g .By (fmt .Sprintf ("Waiting both etcd members to become healthy (timeout: %v)" , membersHealthyAfterDoubleReboot ))
216+ // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
217+ validateEtcdRecoveryState (oc , etcdClientFactory ,
218+ & nodeA ,
219+ & nodeB , true , false ,
220+ membersHealthyAfterDoubleReboot , pollInterval )
221+ })
222+
223+ g .It ("should recover from double graceful node shutdown" , func () {
224+ // Note: Both nodes are gracefully shut down, then both restart
225+ nodeA := peerNode
226+ nodeB := targetNode
227+ g .GinkgoT ().Printf ("Testing double node graceful shutdown for %s and %s\n " , nodeA .Name , nodeB .Name )
228+
229+ c , vmA , vmB , err := setupMinimalTestEnvironment (oc , & nodeA , & nodeB )
230+ o .Expect (err ).To (o .BeNil (), "Expected to setup test environment without error" )
231+
232+ dataPair := []vmNodePair {
233+ {vmA , nodeA .Name },
234+ {vmB , nodeB .Name },
235+ }
236+
237+ defer restartVms (dataPair , c )
238+
239+ g .By (fmt .Sprintf ("Gracefully shutting down both nodes at the same time (timeout: %v)" , vmGracefulShutdownTimeout ))
221240 for _ , d := range dataPair {
222- err := services .VirshStartVM (d .vm , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
223- o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected to start VM %s (node: %s)" , d .vm , d .node ))
241+ innerErr := services .VirshShutdownVM (d .vm , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
242+ o .Expect (innerErr ).To (o .BeNil (), fmt .Sprintf ("Expected to gracefully shutdown VM %s (node: %s)" , d .vm , d .node ))
224243 }
225- // Wait for all to be running
244+
226245 for _ , d := range dataPair {
227- err := services .WaitForVMState (d .vm , services .VMStateRunning , vmUngracefulShutdownTimeout , pollInterval , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
228- o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s (node: %s) to start in %s timeout" , d .vm , d .node , vmRestartTimeout ))
246+ innerErr := services .WaitForVMState (d .vm , services .VMStateShutOff , vmGracefulShutdownTimeout , pollInterval , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
247+ o .Expect (innerErr ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s (node: %s) to reach shut off state" , d .vm , d .node ))
248+ }
249+
250+ g .By ("Restarting both nodes" )
251+ restartVms (dataPair , c )
252+
253+ g .By (fmt .Sprintf ("Waiting both etcd members to become healthy (timeout: %v)" , membersHealthyAfterDoubleReboot ))
254+ // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
255+ validateEtcdRecoveryState (oc , etcdClientFactory ,
256+ & nodeA ,
257+ & nodeB , true , false ,
258+ membersHealthyAfterDoubleReboot , pollInterval )
259+ })
260+
261+ g .It ("should recover from sequential graceful node shutdowns" , func () {
262+ // Note: First node is gracefully shut down, then the second, then both restart
263+ firstToShutdown := peerNode
264+ secondToShutdown := targetNode
265+ g .GinkgoT ().Printf ("Testing sequential graceful shutdowns: first %s, then %s\n " ,
266+ firstToShutdown .Name , secondToShutdown .Name )
267+
268+ c , vmFirstToShutdown , vmSecondToShutdown , err := setupMinimalTestEnvironment (oc , & firstToShutdown , & secondToShutdown )
269+ o .Expect (err ).To (o .BeNil (), "Expected to setup test environment without error" )
270+
271+ dataPair := []vmNodePair {
272+ {vmFirstToShutdown , firstToShutdown .Name },
273+ {vmSecondToShutdown , secondToShutdown .Name },
274+ }
275+
276+ defer restartVms (dataPair , c )
277+
278+ g .By (fmt .Sprintf ("Gracefully shutting down first node: %s" , firstToShutdown .Name ))
279+
280+ err = vmShutdownAndWait (VMShutdownModeGraceful , vmFirstToShutdown , c )
281+ o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s to reach shut off state" , vmFirstToShutdown ))
282+
283+ g .By (fmt .Sprintf ("Gracefully shutting down second node: %s" , secondToShutdown .Name ))
284+ err = vmShutdownAndWait (VMShutdownModeGraceful , vmSecondToShutdown , c )
285+ o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s to reach shut off state" , vmSecondToShutdown ))
286+
287+ g .By ("Restarting both nodes" )
288+ restartVms (dataPair , c )
289+
290+ g .By (fmt .Sprintf ("Waiting both etcd members to become healthy (timeout: %v)" , membersHealthyAfterDoubleReboot ))
291+ // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
292+ validateEtcdRecoveryState (oc , etcdClientFactory ,
293+ & firstToShutdown ,
294+ & secondToShutdown , true , false ,
295+ membersHealthyAfterDoubleReboot , pollInterval )
296+ })
297+
298+ g .It ("should recover from graceful shutdown followed by ungraceful node failure" , func () {
299+ // Note: First node is gracefully shut down, then the survived node fails ungracefully
300+ firstToShutdown := targetNode
301+ secondToShutdown := peerNode
302+ g .GinkgoT ().Printf ("Randomly selected %s to shutdown gracefully and %s to survive, then fail ungracefully\n " ,
303+ firstToShutdown .Name , secondToShutdown .Name )
304+
305+ c , vmFirstToShutdown , vmSecondToShutdown , err := setupMinimalTestEnvironment (oc , & firstToShutdown , & secondToShutdown )
306+ o .Expect (err ).To (o .BeNil (), "Expected to setup test environment without error" )
307+
308+ dataPair := []vmNodePair {
309+ {vmFirstToShutdown , firstToShutdown .Name },
310+ {vmSecondToShutdown , secondToShutdown .Name },
229311 }
230312
313+ defer restartVms (dataPair , c )
314+
315+ g .By (fmt .Sprintf ("Gracefully shutting down VM %s (node: %s)" , vmFirstToShutdown , firstToShutdown .Name ))
316+ err = vmShutdownAndWait (VMShutdownModeGraceful , vmFirstToShutdown , c )
317+ o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s to reach shut off state" , vmFirstToShutdown ))
318+
319+ g .By (fmt .Sprintf ("Waiting for %s to recover the etcd cluster standalone (timeout: %v)" , secondToShutdown .Name , memberIsLeaderTimeout ))
320+ validateEtcdRecoveryState (oc , etcdClientFactory ,
321+ & secondToShutdown ,
322+ & firstToShutdown , false , true , // expected started == false, learner == true
323+ memberIsLeaderTimeout , pollInterval )
324+
325+ g .By (fmt .Sprintf ("Ungracefully shutting down VM %s (node: %s)" , vmSecondToShutdown , secondToShutdown .Name ))
326+ err = vmShutdownAndWait (VMShutdownModeUngraceful , vmSecondToShutdown , c )
327+ o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s to reach shut off state" , vmSecondToShutdown ))
328+
329+ g .By ("Restarting both nodes" )
330+ restartVms (dataPair , c )
331+
231332 g .By (fmt .Sprintf ("Waiting both etcd members to become healthy (timeout: %v)" , membersHealthyAfterDoubleReboot ))
333+ // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter.
232334 validateEtcdRecoveryState (oc , etcdClientFactory ,
233- & nodeA , // member on node A considered leader, hence started == true, learner == false
234- & nodeB , true , false , // member on node B expected started == true, learner == false
335+ & firstToShutdown ,
336+ & secondToShutdown , true , false ,
235337 membersHealthyAfterDoubleReboot , pollInterval )
236338 })
237339})
@@ -336,6 +438,12 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition
336438 return nil
337439}
338440
441+ // validateEtcdRecoveryState polls the etcd cluster until the members match the expected state or a timeout is reached.
442+ //
443+ // This function assumes that the first node argument is always expected to be a healthy, voting member (isStarted=true, isLearner=false).
444+ // It validates the state of the second node argument against the provided `isTargetNodeStartedExpected` and `isTargetNodeLearnerExpected` booleans.
445+ //
446+ // When both nodes are expected to be healthy voting members, the order of the node arguments is interchangeable.
339447func validateEtcdRecoveryState (
340448 oc * util.CLI , e * helpers.EtcdClientFactoryImpl ,
341449 survivedNode , targetNode * corev1.Node ,
@@ -541,3 +649,62 @@ func setupMinimalTestEnvironment(oc *util.CLI, nodeA, nodeB *corev1.Node) (c hyp
541649
542650 return
543651}
652+
// vmNodePair ties a hypervisor VM name to the name of the cluster node that
// runs on it, so helpers can act on the VM and report the node in messages.
type vmNodePair struct {
	// vm is the VM name handed to the virsh helpers.
	vm string
	// node is the corresponding node name, used in log and assertion messages.
	node string
}
656+
// VMShutdownMode selects how vmShutdownAndWait powers a VM off.
type VMShutdownMode int

// Supported shutdown modes. Numbering starts at 1, so the zero value of
// VMShutdownMode is not a valid mode.
const (
	// VMShutdownModeGraceful requests a graceful shutdown of the VM.
	VMShutdownModeGraceful VMShutdownMode = iota + 1
	// VMShutdownModeUngraceful requests an ungraceful shutdown of the VM.
	VMShutdownModeUngraceful
)

// String returns a human-readable description of the mode for log and error
// messages. Values outside the declared constants are reported together with
// their numeric value so that a bad mode can be identified from the output.
func (sm VMShutdownMode) String() string {
	switch sm {
	case VMShutdownModeGraceful:
		return "graceful VM shutdown"
	case VMShutdownModeUngraceful:
		return "ungraceful VM shutdown"
	default:
		return fmt.Sprintf("unknown vm shutdown mode (%d)", int(sm))
	}
}
673+
674+ func vmShutdownAndWait (mode VMShutdownMode , vm string , c hypervisorExtendedConfig ) error {
675+ var timeout time.Duration
676+ var shutdownFunc func (vmName string , sshConfig * core.SSHConfig , knownHostsPath string ) error
677+ switch mode {
678+ case VMShutdownModeGraceful :
679+ timeout = vmGracefulShutdownTimeout
680+ shutdownFunc = services .VirshShutdownVM
681+ case VMShutdownModeUngraceful :
682+ timeout = vmUngracefulShutdownTimeout
683+ shutdownFunc = services .VirshDestroyVM
684+ default :
685+ return fmt .Errorf ("unexpected VMShutdownMode: %s" , mode )
686+ }
687+
688+ g .GinkgoT ().Printf ("%s: vm %s (timeout: %v)\n " , mode , vm , timeout )
689+ err := shutdownFunc (vm , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
690+ if err != nil {
691+ return err
692+ }
693+
694+ return services .WaitForVMState (vm , services .VMStateShutOff , timeout , pollInterval , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
695+ }
696+
697+ func restartVms (dataPair []vmNodePair , c hypervisorExtendedConfig ) {
698+ // Start all VMs asynchronously
699+ for _ , d := range dataPair {
700+ if err := services .VirshStartVM (d .vm , & c .HypervisorConfig , c .HypervisorKnownHostsPath ); err != nil {
701+ fmt .Fprintf (g .GinkgoWriter , "Warning: failed to restart VM %s during cleanup: %v\n " , d .vm , err )
702+ }
703+ }
704+
705+ // Wait for all VMs to be running
706+ for _ , d := range dataPair {
707+ err := services .WaitForVMState (d .vm , services .VMStateRunning , vmRestartTimeout , pollInterval , & c .HypervisorConfig , c .HypervisorKnownHostsPath )
708+ o .Expect (err ).To (o .BeNil (), fmt .Sprintf ("Expected VM %s (node: %s) to start in %s timeout" , d .vm , d .node , vmRestartTimeout ))
709+ }
710+ }
0 commit comments