From 7bd72bfbd6b329c03c38f3e214afbde283c5b6bd Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Thu, 16 Oct 2025 15:58:26 +0200 Subject: [PATCH 1/6] feat: add VirshShutdownVM helper for graceful VM shutdown --- test/extended/two_node/utils/services/libvirt.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/extended/two_node/utils/services/libvirt.go b/test/extended/two_node/utils/services/libvirt.go index c7b654f0c6a9..c6790069435b 100644 --- a/test/extended/two_node/utils/services/libvirt.go +++ b/test/extended/two_node/utils/services/libvirt.go @@ -171,6 +171,20 @@ func VirshDestroyVM(vmName string, sshConfig *core.SSHConfig, knownHostsPath str return err } +// VirshShutdownVM gracefully shuts down a running VM (allows guest OS to shutdown cleanly). +// +// err := VirshShutdownVM("master-0", sshConfig, knownHostsPath) +func VirshShutdownVM(vmName string, sshConfig *core.SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("VirshShutdownVM: Gracefully shutting down VM '%s'", vmName) + _, err := VirshCommand(fmt.Sprintf("shutdown %s", vmName), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshShutdownVM failed", "vm", vmName) + } else { + klog.V(2).Infof("VirshShutdownVM: Successfully initiated shutdown for VM '%s'", vmName) + } + return err +} + // VirshDefineVM defines a new VM from an XML configuration file. // // err := VirshDefineVM("/tmp/master-0.xml", sshConfig, knownHostsPath) From b6e1384dd39547e1baab2c86712438a17a8c9c67 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Tue, 21 Oct 2025 15:18:19 +0200 Subject: [PATCH 2/6] feat: add three new etcd cold boot recovery tests Add three new test cases to validate etcd cluster recovery from cold boot scenarios reached through different graceful/ungraceful shutdown combinations: - Cold boot from double GNS: both nodes gracefully shut down simultaneously, then both restart (full cluster cold boot) - Cold boot from sequential GNS: first node gracefully shut down, then second node gracefully shut down, then both restart - Cold boot from mixed GNS/UGNS: first node gracefully shut down, surviving node then ungracefully shut down, then both restart Note: The inverse case (UGNS first node, then GNS second) is not tested because in TNF clusters, an ungracefully shut down node is quickly recovered, preventing the ability to wait and gracefully shut down the second node later. The double UGNS scenario is already covered by existing tests. 
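For orientation before the diff: the three cold-boot cases above all reduce to "stop the VMs in some order, restart them, wait for etcd to converge", driven through the virsh helpers over SSH. Below is a minimal sketch of the sequential-GNS flow composed from only the helper calls that appear in these patches (VirshShutdownVM, WaitForVMState, VirshStartVM); the example package, the sequentialGracefulColdBoot name, and the import paths are assumptions inferred from the utils/ layout shown in the diffs, not code from the series.

// Illustrative only; not part of the patch series.
package example

import (
	"fmt"
	"time"

	// Import paths assumed from the test/extended/two_node/utils layout in these diffs.
	"github.com/openshift/origin/test/extended/two_node/utils/core"
	"github.com/openshift/origin/test/extended/two_node/utils/services"
)

// sequentialGracefulColdBoot gracefully stops each VM in order, waits for it to reach
// the shut off state, then restarts every VM and waits for it to run again.
// Timeouts mirror the constants introduced in patch 2.
func sequentialGracefulColdBoot(vms []string, ssh *core.SSHConfig, knownHosts string) error {
	const (
		pollInterval            = 5 * time.Second
		gracefulShutdownTimeout = 10 * time.Minute
		restartTimeout          = 5 * time.Minute
	)

	for _, vm := range vms {
		if err := services.VirshShutdownVM(vm, ssh, knownHosts); err != nil {
			return fmt.Errorf("graceful shutdown of VM %s failed: %w", vm, err)
		}
		if err := services.WaitForVMState(vm, services.VMStateShutOff, gracefulShutdownTimeout, pollInterval, ssh, knownHosts); err != nil {
			return fmt.Errorf("VM %s did not reach shut off state: %w", vm, err)
		}
	}

	for _, vm := range vms {
		if err := services.VirshStartVM(vm, ssh, knownHosts); err != nil {
			return fmt.Errorf("restart of VM %s failed: %w", vm, err)
		}
	}
	for _, vm := range vms {
		if err := services.WaitForVMState(vm, services.VMStateRunning, restartTimeout, pollInterval, ssh, knownHosts); err != nil {
			return fmt.Errorf("VM %s did not come back to running state: %w", vm, err)
		}
	}
	return nil
}

The double-GNS case is the same sketch with the two shutdown calls issued before either wait; the mixed GNS/UGNS case swaps VirshShutdownVM for VirshDestroyVM on the surviving node.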
--- test/extended/two_node/tnf_recovery.go | 209 ++++++++++++++++++++++--- 1 file changed, 188 insertions(+), 21 deletions(-) diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go index b40e5f71a8d7..fe8ba82cdf50 100644 --- a/test/extended/two_node/tnf_recovery.go +++ b/test/extended/two_node/tnf_recovery.go @@ -32,8 +32,9 @@ const ( memberPromotedVotingTimeout = 15 * time.Minute networkDisruptionDuration = 15 * time.Second vmRestartTimeout = 5 * time.Minute - vmUngracefulShutdownTimeout = 30 * time.Second // Ungraceful VM shutdown is typically fast - membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM recovering up to Etcd member healthy + vmUngracefulShutdownTimeout = 30 * time.Second // Ungraceful shutdown is typically fast + vmGracefulShutdownTimeout = 10 * time.Minute // Graceful shutdown is typically slow + membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM reboot and Etcd member healthy pollInterval = 5 * time.Second ) @@ -181,7 +182,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual memberPromotedVotingTimeout, pollInterval) }) - g.It("should recover from a double node failure", func() { + g.It("should recover from a double node failure (cold-boot)", func() { // Note: In a double node failure both nodes have the same role, hence we // will call them just NodeA and NodeB nodeA := peerNode @@ -189,20 +190,12 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB) o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error") - dataPair := []struct { - vm, node string - }{ + dataPair := []vmNodePair{ {vmA, nodeA.Name}, {vmB, nodeB.Name}, } - defer func() { - for _, d := range dataPair { - if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil { - fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err) - } - } - }() + defer restartVms(dataPair, c) g.By("Simulating double node failure: stopping both nodes' VMs") // First, stop all VMs @@ -217,21 +210,130 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual } g.By("Restarting both nodes") - // Start all VMs + restartVms(dataPair, c) + + g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot)) + // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter. 
+ validateEtcdRecoveryState(oc, etcdClientFactory, + &nodeA, + &nodeB, true, false, + membersHealthyAfterDoubleReboot, pollInterval) + }) + + g.It("should recover from double graceful node shutdown (cold-boot)", func() { + // Note: Both nodes are gracefully shut down, then both restart + nodeA := peerNode + nodeB := targetNode + g.GinkgoT().Printf("Testing double node graceful shutdown for %s and %s\n", nodeA.Name, nodeB.Name) + + c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB) + o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error") + + dataPair := []vmNodePair{ + {vmA, nodeA.Name}, + {vmB, nodeB.Name}, + } + + defer restartVms(dataPair, c) + + g.By(fmt.Sprintf("Gracefully shutting down both nodes at the same time (timeout: %v)", vmGracefulShutdownTimeout)) for _, d := range dataPair { - err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath) - o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to start VM %s (node: %s)", d.vm, d.node)) + innerErr := services.VirshShutdownVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath) + o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected to gracefully shutdown VM %s (node: %s)", d.vm, d.node)) } - // Wait for all to be running + for _, d := range dataPair { - err := services.WaitForVMState(d.vm, services.VMStateRunning, vmUngracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath) - o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout)) + innerErr := services.WaitForVMState(d.vm, services.VMStateShutOff, vmGracefulShutdownTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath) + o.Expect(innerErr).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to reach shut off state", d.vm, d.node)) + } + + g.By("Restarting both nodes") + restartVms(dataPair, c) + + g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot)) + // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter. 
+ validateEtcdRecoveryState(oc, etcdClientFactory, + &nodeA, + &nodeB, true, false, + membersHealthyAfterDoubleReboot, pollInterval) + }) + + g.It("should recover from sequential graceful node shutdowns (cold-boot)", func() { + // Note: First node is gracefully shut down, then the second, then both restart + firstToShutdown := peerNode + secondToShutdown := targetNode + g.GinkgoT().Printf("Testing sequential graceful shutdowns: first %s, then %s\n", + firstToShutdown.Name, secondToShutdown.Name) + + c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown) + o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error") + + dataPair := []vmNodePair{ + {vmFirstToShutdown, firstToShutdown.Name}, + {vmSecondToShutdown, secondToShutdown.Name}, + } + + defer restartVms(dataPair, c) + + g.By(fmt.Sprintf("Gracefully shutting down first node: %s", firstToShutdown.Name)) + + err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c) + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown)) + + g.By(fmt.Sprintf("Gracefully shutting down second node: %s", secondToShutdown.Name)) + err = vmShutdownAndWait(VMShutdownModeGraceful, vmSecondToShutdown, c) + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown)) + + g.By("Restarting both nodes") + restartVms(dataPair, c) + + g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot)) + // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter. + validateEtcdRecoveryState(oc, etcdClientFactory, + &firstToShutdown, + &secondToShutdown, true, false, + membersHealthyAfterDoubleReboot, pollInterval) + }) + + g.It("should recover from graceful shutdown followed by ungraceful node failure (cold-boot)", func() { + // Note: First node is gracefully shut down, then the survived node fails ungracefully + firstToShutdown := targetNode + secondToShutdown := peerNode + g.GinkgoT().Printf("Randomly selected %s to shutdown gracefully and %s to survive, then fail ungracefully\n", + firstToShutdown.Name, secondToShutdown.Name) + + c, vmFirstToShutdown, vmSecondToShutdown, err := setupMinimalTestEnvironment(oc, &firstToShutdown, &secondToShutdown) + o.Expect(err).To(o.BeNil(), "Expected to setup test environment without error") + + dataPair := []vmNodePair{ + {vmFirstToShutdown, firstToShutdown.Name}, + {vmSecondToShutdown, secondToShutdown.Name}, } + defer restartVms(dataPair, c) + + g.By(fmt.Sprintf("Gracefully shutting down VM %s (node: %s)", vmFirstToShutdown, firstToShutdown.Name)) + err = vmShutdownAndWait(VMShutdownModeGraceful, vmFirstToShutdown, c) + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmFirstToShutdown)) + + g.By(fmt.Sprintf("Waiting for %s to recover the etcd cluster standalone (timeout: %v)", secondToShutdown.Name, memberIsLeaderTimeout)) + validateEtcdRecoveryState(oc, etcdClientFactory, + &secondToShutdown, + &firstToShutdown, false, true, // expected started == false, learner == true + memberIsLeaderTimeout, pollInterval) + + g.By(fmt.Sprintf("Ungracefully shutting down VM %s (node: %s)", vmSecondToShutdown, secondToShutdown.Name)) + err = vmShutdownAndWait(VMShutdownModeUngraceful, vmSecondToShutdown, c) + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s to reach shut off state", vmSecondToShutdown)) + + 
g.By("Restarting both nodes") + restartVms(dataPair, c) + g.By(fmt.Sprintf("Waiting both etcd members to become healthy (timeout: %v)", membersHealthyAfterDoubleReboot)) + // Both nodes are expected to be healthy voting members. The order of nodes passed to the validation function does not matter. validateEtcdRecoveryState(oc, etcdClientFactory, - &nodeA, // member on node A considered leader, hence started == true, learner == false - &nodeB, true, false, // member on node B expected started == true, learner == false + &firstToShutdown, + &secondToShutdown, true, false, membersHealthyAfterDoubleReboot, pollInterval) }) }) @@ -336,6 +438,12 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition return nil } +// validateEtcdRecoveryState polls the etcd cluster until the members match the expected state or a timeout is reached. +// +// This function assumes that the first node argument is always expected to be a healthy, voting member (isStarted=true, isLearner=false). +// It validates the state of the second node argument against the provided `isTargetNodeStartedExpected` and `isTargetNodeLearnerExpected` booleans. +// +// When both nodes are expected to be healthy voting members, the order of the node arguments is interchangeable. func validateEtcdRecoveryState( oc *util.CLI, e *helpers.EtcdClientFactoryImpl, survivedNode, targetNode *corev1.Node, @@ -541,3 +649,62 @@ func setupMinimalTestEnvironment(oc *util.CLI, nodeA, nodeB *corev1.Node) (c hyp return } + +type vmNodePair struct { + vm, node string +} + +type VMShutdownMode int + +const ( + VMShutdownModeGraceful VMShutdownMode = iota + 1 + VMShutdownModeUngraceful +) + +func (sm VMShutdownMode) String() string { + switch sm { + case VMShutdownModeGraceful: + return "graceful VM shutdown" + case VMShutdownModeUngraceful: + return "ungraceful VM shutdown" + } + return "unknown vm shutdown mode" +} + +func vmShutdownAndWait(mode VMShutdownMode, vm string, c hypervisorExtendedConfig) error { + var timeout time.Duration + var shutdownFunc func(vmName string, sshConfig *core.SSHConfig, knownHostsPath string) error + switch mode { + case VMShutdownModeGraceful: + timeout = vmGracefulShutdownTimeout + shutdownFunc = services.VirshShutdownVM + case VMShutdownModeUngraceful: + timeout = vmUngracefulShutdownTimeout + shutdownFunc = services.VirshDestroyVM + default: + return fmt.Errorf("unexpected VMShutdownMode: %s", mode) + } + + g.GinkgoT().Printf("%s: vm %s (timeout: %v)\n", mode, vm, timeout) + err := shutdownFunc(vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath) + if err != nil { + return err + } + + return services.WaitForVMState(vm, services.VMStateShutOff, timeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath) +} + +func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) { + // Start all VMs asynchronously + for _, d := range dataPair { + if err := services.VirshStartVM(d.vm, &c.HypervisorConfig, c.HypervisorKnownHostsPath); err != nil { + fmt.Fprintf(g.GinkgoWriter, "Warning: failed to restart VM %s during cleanup: %v\n", d.vm, err) + } + } + + // Wait for all VMs to be running + for _, d := range dataPair { + err := services.WaitForVMState(d.vm, services.VMStateRunning, vmRestartTimeout, pollInterval, &c.HypervisorConfig, c.HypervisorKnownHostsPath) + o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected VM %s (node: %s) to start in %s timeout", d.vm, d.node, vmRestartTimeout)) + } +} From f918765bd2eaa1e8c84fd78447b013e8d14269ee Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: 
Thu, 30 Oct 2025 19:15:30 +0100 Subject: [PATCH 3/6] test: skip etcd recovery tests when cluster is unhealthy Change BeforeEach health checks to skip tests instead of failing them when the cluster is not in a healthy state at the start of the test. Previously, the etcd recovery tests would fail if the cluster was not healthy before the test started. This is problematic because these tests are designed to validate recovery from intentional disruptions, not to debug pre-existing cluster issues. Changes: - Extract health validation functions to common.go for reusability - Add skipIfClusterIsNotHealthy() to consolidate all health checks - Implement internal retry logic in health check functions with timeouts - Add ensureEtcdHasTwoVotingMembers() to validate membership state - Skip tests early if cluster is degraded, pods aren't running, or members are unhealthy This ensures tests only run when the cluster is in a known-good state, reducing false failures due to pre-existing issues while maintaining test coverage for actual recovery scenarios. --- test/extended/two_node/common.go | 141 +++++++++++++++++++++++++ test/extended/two_node/tnf_recovery.go | 82 +------------- 2 files changed, 145 insertions(+), 78 deletions(-) diff --git a/test/extended/two_node/common.go b/test/extended/two_node/common.go index 6c68d43ea715..e525ffadc898 100644 --- a/test/extended/two_node/common.go +++ b/test/extended/two_node/common.go @@ -3,12 +3,16 @@ package two_node import ( "context" "fmt" + "time" v1 "github.com/openshift/api/config/v1" + "github.com/openshift/origin/test/extended/etcd/helpers" "github.com/openshift/origin/test/extended/util" exutil "github.com/openshift/origin/test/extended/util" + "go.etcd.io/etcd/api/v3/etcdserverpb" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/test/e2e/framework" e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" ) @@ -17,6 +21,10 @@ const ( labelNodeRoleControlPlane = "node-role.kubernetes.io/control-plane" labelNodeRoleWorker = "node-role.kubernetes.io/worker" labelNodeRoleArbiter = "node-role.kubernetes.io/arbiter" + + clusterOperatorIsHealthyTimeout = time.Minute + nodeIsHealthyTimeout = time.Minute + resourceIsHealthyTimeout = time.Minute ) func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) { @@ -29,6 +37,26 @@ func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) { } } +func skipIfClusterIsNotHealthy(oc *util.CLI, ecf *helpers.EtcdClientFactoryImpl, nodes *corev1.NodeList) { + framework.Logf("Ensure Etcd pods are running") + err := ensureEtcdPodsAreRunning(oc) + if err != nil { + e2eskipper.Skip(fmt.Sprintf("could not ensure etcd pods are running: %v", err)) + } + + framework.Logf("Ensure Etcd member list has two voting members") + err = ensureEtcdHasTwoVotingMembers(nodes, ecf) + if err != nil { + e2eskipper.Skip(fmt.Sprintf("could not ensure etcd has two voting members: %v", err)) + } + + framework.Logf("Ensure cluster operator is healthy") + err = ensureClusterOperatorHealthy(oc) + if err != nil { + e2eskipper.Skip(fmt.Sprintf("could not ensure cluster-operator is healthy: %v", err)) + } +} + func isClusterOperatorAvailable(operator *v1.ClusterOperator) bool { for _, cond := range operator.Status.Conditions { if cond.Type == v1.OperatorAvailable && cond.Status == v1.ConditionTrue { @@ -54,3 +82,116 @@ func hasNodeRebooted(oc *util.CLI, node *corev1.Node) (bool, error) { return n.Status.NodeInfo.BootID != node.Status.NodeInfo.BootID, nil } } + +// 
ensureClusterOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests +func ensureClusterOperatorHealthy(oc *util.CLI) error { + ctx, cancel := context.WithTimeout(context.Background(), clusterOperatorIsHealthyTimeout) + defer cancel() + + var err error + var co *v1.ClusterOperator + for { + if co, err = oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{}); err != nil { + err = fmt.Errorf("failed to retrieve ClusterOperator: %v", err) + } else { + // Check if etcd operator is Available + available := findClusterOperatorCondition(co.Status.Conditions, v1.OperatorAvailable) + if available == nil { + err = fmt.Errorf("ClusterOperator Available condition not found") + } else if available.Status != v1.ConditionTrue { + err = fmt.Errorf("ClusterOperator is not Available: %s", available.Message) + } else { + // Check if etcd operator is not Degraded + degraded := findClusterOperatorCondition(co.Status.Conditions, v1.OperatorDegraded) + if degraded != nil && degraded.Status == v1.ConditionTrue { + err = fmt.Errorf("ClusterOperator is Degraded: %s", degraded.Message) + } else { + return nil + } + } + } + + select { + case <-ctx.Done(): + return err + default: + } + time.Sleep(pollInterval) + } +} + +func ensureEtcdPodsAreRunning(oc *util.CLI) error { + etcdPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{ + LabelSelector: "app=etcd", + }) + if err != nil { + return fmt.Errorf("failed to retrieve etcd pods: %v", err) + } + + runningPods := 0 + for _, pod := range etcdPods.Items { + if pod.Status.Phase == corev1.PodRunning { + runningPods++ + } + } + + if runningPods < 2 { + return fmt.Errorf("expected at least 2 etcd pods running, found %d", runningPods) + } + + return nil +} + +// findClusterOperatorCondition finds a condition in ClusterOperator status +func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition, conditionType v1.ClusterStatusConditionType) *v1.ClusterOperatorStatusCondition { + for i := range conditions { + if conditions[i].Type == conditionType { + return &conditions[i] + } + } + return nil +} + +func ensureEtcdHasTwoVotingMembers(nodes *corev1.NodeList, ecf *helpers.EtcdClientFactoryImpl) error { + ctx, cancel := context.WithTimeout(context.Background(), resourceIsHealthyTimeout) + defer cancel() + + for { + var err error + var members []*etcdserverpb.Member + + // Check all conditions sequentially + members, err = getMembers(ecf) + if err == nil && len(members) != 2 { + err = fmt.Errorf("expected 2 members, found %d", len(members)) + } + + if err == nil { + for _, node := range nodes.Items { + isStarted, isLearner, checkErr := getMemberState(&node, members) + if checkErr != nil { + err = checkErr + } else if !isStarted || isLearner { + err = fmt.Errorf("member %s is not a voting member (started=%v, learner=%v)", + node.Name, isStarted, isLearner) + break + } + } + + } + + // All checks passed - success! 
+ if err == nil { + framework.Logf("SUCCESS: got membership with two voting members: %+v", members) + return nil + } + + // Checks failed - evaluate timeout + select { + case <-ctx.Done(): + return fmt.Errorf("etcd membership does not have two voters: %v, membership: %+v", err, members) + default: + } + time.Sleep(pollInterval) + } +} diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go index fe8ba82cdf50..3466fb78478c 100644 --- a/test/extended/two_node/tnf_recovery.go +++ b/test/extended/two_node/tnf_recovery.go @@ -24,8 +24,6 @@ import ( ) const ( - nodeIsHealthyTimeout = time.Minute - etcdOperatorIsHealthyTimeout = time.Minute memberHasLeftTimeout = 5 * time.Minute memberIsLeaderTimeout = 10 * time.Minute memberRejoinedLearnerTimeout = 10 * time.Minute @@ -55,32 +53,19 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual g.BeforeEach(func() { skipIfNotTopology(oc, v1.DualReplicaTopologyMode) - g.By("Verifying etcd cluster operator is healthy before starting test") - o.Eventually(func() error { - return ensureEtcdOperatorHealthy(oc) - }, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy before starting test") - nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find 2 Nodes only") + etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) + + skipIfClusterIsNotHealthy(oc, etcdClientFactory, nodes) + // Select the first index randomly randomIndex := rand.Intn(len(nodes.Items)) peerNode = nodes.Items[randomIndex] // Select the remaining index targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)] - - kubeClient := oc.KubeClient() - etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient) - - g.GinkgoT().Printf("Ensure both nodes are healthy before starting the test\n") - o.Eventually(func() error { - return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, peerNode.Name) - }, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("expect to ensure Node '%s' healthiness without errors", peerNode.Name)) - - o.Eventually(func() error { - return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, targetNode.Name) - }, nodeIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("expect to ensure Node '%s' healthiness without errors", targetNode.Name)) }) g.It("should recover from graceful node shutdown with etcd member re-addition", func() { @@ -379,65 +364,6 @@ func getMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started, return started, learner, nil } -// ensureEtcdOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests -func ensureEtcdOperatorHealthy(oc *util.CLI) error { - g.By("Checking etcd ClusterOperator status") - etcdOperator, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(context.Background(), "etcd", metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to retrieve etcd ClusterOperator: %v", err) - } - - // Check if etcd operator is Available - available := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorAvailable) - if available == nil || available.Status != v1.ConditionTrue { - return fmt.Errorf("etcd ClusterOperator is not Available: %v", available) - } - - // Check if 
etcd operator is not Degraded - degraded := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorDegraded) - if degraded != nil && degraded.Status == v1.ConditionTrue { - return fmt.Errorf("etcd ClusterOperator is Degraded: %s", degraded.Message) - } - - // Check if etcd operator is not Progressing (optional - might be ok during normal operations) - progressing := findClusterOperatorCondition(etcdOperator.Status.Conditions, v1.OperatorProgressing) - if progressing != nil && progressing.Status == v1.ConditionTrue { - g.GinkgoT().Logf("Warning: etcd ClusterOperator is Progressing: %s", progressing.Message) - } - - g.By("Checking etcd pods are running") - etcdPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{ - LabelSelector: "app=etcd", - }) - if err != nil { - return fmt.Errorf("failed to retrieve etcd pods: %v", err) - } - - runningPods := 0 - for _, pod := range etcdPods.Items { - if pod.Status.Phase == corev1.PodRunning { - runningPods++ - } - } - - if runningPods < 2 { - return fmt.Errorf("expected at least 2 etcd pods running, found %d", runningPods) - } - - g.GinkgoT().Logf("etcd cluster operator is healthy: Available=True, Degraded=False, %d pods running", runningPods) - return nil -} - -// findClusterOperatorCondition finds a condition in ClusterOperator status -func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition, conditionType v1.ClusterStatusConditionType) *v1.ClusterOperatorStatusCondition { - for i := range conditions { - if conditions[i].Type == conditionType { - return &conditions[i] - } - } - return nil -} - // validateEtcdRecoveryState polls the etcd cluster until the members match the expected state or a timeout is reached. // // This function assumes that the first node argument is always expected to be a healthy, voting member (isStarted=true, isLearner=false). 
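The health checks added in this patch all share one retry shape: open a context with a timeout, run the check, return nil on success, otherwise keep the last error, bail out with it when the context expires, and sleep pollInterval between attempts. A condensed standalone version of that shape, assuming a hypothetical pollUntil helper name (the patches inline the loop rather than factoring it out):

// Illustrative only; not part of the patch series.
package example

import (
	"context"
	"time"
)

// pollUntil runs check every interval until it returns nil or the timeout expires.
// On timeout it returns the last error from check rather than a bare context error,
// so the resulting skip message says what was actually unhealthy.
func pollUntil(timeout, interval time.Duration, check func() error) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	for {
		err := check()
		if err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return err
		default:
		}
		time.Sleep(interval)
	}
}

In this patch, ensureClusterOperatorHealthy and ensureEtcdHasTwoVotingMembers inline this shape with their own check bodies; patch 4 below brings ensureEtcdPodsAreRunning into the same form and unifies the timeouts.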
From e5063f45661949ee3463965adb26fef99c1ff2b7 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Mon, 3 Nov 2025 09:45:30 +0100 Subject: [PATCH 4/6] Enhance initial check implementing a timeout for the etcd pods count Also unified timeouts for the initial checks and improved logging --- test/extended/two_node/common.go | 67 +++++++++++++++++--------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/test/extended/two_node/common.go b/test/extended/two_node/common.go index e525ffadc898..0480d0c2192f 100644 --- a/test/extended/two_node/common.go +++ b/test/extended/two_node/common.go @@ -9,7 +9,6 @@ import ( "github.com/openshift/origin/test/extended/etcd/helpers" "github.com/openshift/origin/test/extended/util" exutil "github.com/openshift/origin/test/extended/util" - "go.etcd.io/etcd/api/v3/etcdserverpb" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/kubernetes/test/e2e/framework" @@ -22,9 +21,7 @@ const ( labelNodeRoleWorker = "node-role.kubernetes.io/worker" labelNodeRoleArbiter = "node-role.kubernetes.io/arbiter" - clusterOperatorIsHealthyTimeout = time.Minute - nodeIsHealthyTimeout = time.Minute - resourceIsHealthyTimeout = time.Minute + clusterIsHealthyTimeout = 5 * time.Minute ) func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) { @@ -38,19 +35,16 @@ func skipIfNotTopology(oc *exutil.CLI, wanted v1.TopologyMode) { } func skipIfClusterIsNotHealthy(oc *util.CLI, ecf *helpers.EtcdClientFactoryImpl, nodes *corev1.NodeList) { - framework.Logf("Ensure Etcd pods are running") err := ensureEtcdPodsAreRunning(oc) if err != nil { e2eskipper.Skip(fmt.Sprintf("could not ensure etcd pods are running: %v", err)) } - framework.Logf("Ensure Etcd member list has two voting members") err = ensureEtcdHasTwoVotingMembers(nodes, ecf) if err != nil { e2eskipper.Skip(fmt.Sprintf("could not ensure etcd has two voting members: %v", err)) } - framework.Logf("Ensure cluster operator is healthy") err = ensureClusterOperatorHealthy(oc) if err != nil { e2eskipper.Skip(fmt.Sprintf("could not ensure cluster-operator is healthy: %v", err)) @@ -85,13 +79,13 @@ func hasNodeRebooted(oc *util.CLI, node *corev1.Node) (bool, error) { // ensureClusterOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests func ensureClusterOperatorHealthy(oc *util.CLI) error { - ctx, cancel := context.WithTimeout(context.Background(), clusterOperatorIsHealthyTimeout) + framework.Logf("Ensure cluster operator is healthy (timeout: %v)", clusterIsHealthyTimeout) + ctx, cancel := context.WithTimeout(context.Background(), clusterIsHealthyTimeout) defer cancel() - var err error - var co *v1.ClusterOperator for { - if co, err = oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{}); err != nil { + co, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(ctx, "etcd", metav1.GetOptions{}) + if err != nil { err = fmt.Errorf("failed to retrieve ClusterOperator: %v", err) } else { // Check if etcd operator is Available @@ -106,6 +100,7 @@ func ensureClusterOperatorHealthy(oc *util.CLI) error { if degraded != nil && degraded.Status == v1.ConditionTrue { err = fmt.Errorf("ClusterOperator is Degraded: %s", degraded.Message) } else { + framework.Logf("SUCCESS: Cluster operator is healthy") return nil } } @@ -121,25 +116,37 @@ func ensureClusterOperatorHealthy(oc *util.CLI) error { } func ensureEtcdPodsAreRunning(oc *util.CLI) error { - etcdPods, err := 
oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{ - LabelSelector: "app=etcd", - }) - if err != nil { - return fmt.Errorf("failed to retrieve etcd pods: %v", err) - } + framework.Logf("Ensure Etcd pods are running (timeout: %v)", clusterIsHealthyTimeout) + ctx, cancel := context.WithTimeout(context.Background(), clusterIsHealthyTimeout) + defer cancel() + for { + etcdPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{ + LabelSelector: "app=etcd", + }) + if err != nil { + err = fmt.Errorf("failed to retrieve etcd pods: %v", err) + } else { + runningPods := 0 + for _, pod := range etcdPods.Items { + if pod.Status.Phase == corev1.PodRunning { + runningPods++ + } + } + if runningPods < 2 { + return fmt.Errorf("expected at least 2 etcd pods running, found %d", runningPods) + } - runningPods := 0 - for _, pod := range etcdPods.Items { - if pod.Status.Phase == corev1.PodRunning { - runningPods++ + framework.Logf("SUCCESS: found the 2 expected Etcd pods") + return nil } - } - if runningPods < 2 { - return fmt.Errorf("expected at least 2 etcd pods running, found %d", runningPods) + select { + case <-ctx.Done(): + return err + default: + } + time.Sleep(pollInterval) } - - return nil } // findClusterOperatorCondition finds a condition in ClusterOperator status @@ -153,15 +160,13 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition } func ensureEtcdHasTwoVotingMembers(nodes *corev1.NodeList, ecf *helpers.EtcdClientFactoryImpl) error { - ctx, cancel := context.WithTimeout(context.Background(), resourceIsHealthyTimeout) + framework.Logf("Ensure Etcd member list has two voting members (timeout: %v)", clusterIsHealthyTimeout) + ctx, cancel := context.WithTimeout(context.Background(), clusterIsHealthyTimeout) defer cancel() for { - var err error - var members []*etcdserverpb.Member - // Check all conditions sequentially - members, err = getMembers(ecf) + members, err := getMembers(ecf) if err == nil && len(members) != 2 { err = fmt.Errorf("expected 2 members, found %d", len(members)) } From b02cf1a7cba51e624941ce89bcfc5ef7586909c4 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Mon, 3 Nov 2025 09:47:45 +0100 Subject: [PATCH 5/6] Extend recovery timeouts --- test/extended/two_node/tnf_recovery.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go index 3466fb78478c..4c45caebea86 100644 --- a/test/extended/two_node/tnf_recovery.go +++ b/test/extended/two_node/tnf_recovery.go @@ -25,14 +25,14 @@ import ( const ( memberHasLeftTimeout = 5 * time.Minute - memberIsLeaderTimeout = 10 * time.Minute + memberIsLeaderTimeout = 20 * time.Minute memberRejoinedLearnerTimeout = 10 * time.Minute memberPromotedVotingTimeout = 15 * time.Minute networkDisruptionDuration = 15 * time.Second vmRestartTimeout = 5 * time.Minute vmUngracefulShutdownTimeout = 30 * time.Second // Ungraceful shutdown is typically fast vmGracefulShutdownTimeout = 10 * time.Minute // Graceful shutdown is typically slow - membersHealthyAfterDoubleReboot = 15 * time.Minute // It takes into account full VM reboot and Etcd member healthy + membersHealthyAfterDoubleReboot = 30 * time.Minute // It takes into account full VM reboot and Etcd member healthy pollInterval = 5 * time.Second ) From 90b1da64ec37d08437aa4a82954bc3a303f03494 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Fri, 14 Nov 2025 10:11:59 +0100 
Subject: [PATCH 6/6] Add skipped tag with known issue reference --- test/extended/two_node/tnf_recovery.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go index 4c45caebea86..4bf0e848eae1 100644 --- a/test/extended/two_node/tnf_recovery.go +++ b/test/extended/two_node/tnf_recovery.go @@ -167,9 +167,10 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual memberPromotedVotingTimeout, pollInterval) }) - g.It("should recover from a double node failure (cold-boot)", func() { + g.It("should recover from a double node failure (cold-boot) [Skipped:KnownIssue]", func() { // Note: In a double node failure both nodes have the same role, hence we // will call them just NodeA and NodeB + // Currently skipped due to OCPBUGS-59238: rapid podman-etcd restart fails on unpatched clusters nodeA := peerNode nodeB := targetNode c, vmA, vmB, err := setupMinimalTestEnvironment(oc, &nodeA, &nodeB) @@ -205,8 +206,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual membersHealthyAfterDoubleReboot, pollInterval) }) - g.It("should recover from double graceful node shutdown (cold-boot)", func() { + g.It("should recover from double graceful node shutdown (cold-boot) [Skipped:KnownIssue]", func() { // Note: Both nodes are gracefully shut down, then both restart + // Currently skipped due to OCPBUGS-59238: rapid podman-etcd restart fails on unpatched clusters nodeA := peerNode nodeB := targetNode g.GinkgoT().Printf("Testing double node graceful shutdown for %s and %s\n", nodeA.Name, nodeB.Name) @@ -243,8 +245,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual membersHealthyAfterDoubleReboot, pollInterval) }) - g.It("should recover from sequential graceful node shutdowns (cold-boot)", func() { + g.It("should recover from sequential graceful node shutdowns (cold-boot) [Skipped:KnownIssue]", func() { // Note: First node is gracefully shut down, then the second, then both restart + // Currently skipped due to OCPBUGS-59238: rapid podman-etcd restart fails on unpatched clusters firstToShutdown := peerNode secondToShutdown := targetNode g.GinkgoT().Printf("Testing sequential graceful shutdowns: first %s, then %s\n", @@ -280,8 +283,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual membersHealthyAfterDoubleReboot, pollInterval) }) - g.It("should recover from graceful shutdown followed by ungraceful node failure (cold-boot)", func() { + g.It("should recover from graceful shutdown followed by ungraceful node failure (cold-boot) [Skipped:KnownIssue]", func() { // Note: First node is gracefully shut down, then the survived node fails ungracefully + // Currently skipped due to OCPBUGS-59238: rapid podman-etcd restart fails on unpatched clusters firstToShutdown := targetNode secondToShutdown := peerNode g.GinkgoT().Printf("Randomly selected %s to shutdown gracefully and %s to survive, then fail ungracefully\n",