diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go
index b8a65e1ba223..fde52d48291d 100644
--- a/test/extended/two_node/tnf_recovery.go
+++ b/test/extended/two_node/tnf_recovery.go
@@ -78,6 +78,11 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		peerNode = nodes.Items[randomIndex]
 		// Select the remaining index
 		targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]
+
+		// Log final pcs and etcd status after every test (pass or fail) via SSH
+		g.DeferCleanup(func() {
+			logFinalClusterStatus([]corev1.Node{peerNode, targetNode})
+		})
 	})
 
 	g.It("should recover from graceful node shutdown with etcd member re-addition", func() {
@@ -778,6 +783,74 @@ func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
 	}
 }
 
+// logFinalClusterStatus logs pcs status and etcd member list via SSH after every test
+// (pass or fail). Uses the hypervisor SSH path because the Kubernetes API may not be
+// available after a recovery test. Errors are logged but never fail the test.
+func logFinalClusterStatus(nodes []corev1.Node) {
+	if !exutil.HasHypervisorConfig() {
+		return
+	}
+
+	sshConfig := exutil.GetHypervisorConfig()
+	hypervisorConfig := core.SSHConfig{
+		IP:             sshConfig.HypervisorIP,
+		User:           sshConfig.SSHUser,
+		PrivateKeyPath: sshConfig.PrivateKeyPath,
+	}
+
+	if _, err := os.Stat(hypervisorConfig.PrivateKeyPath); os.IsNotExist(err) {
+		framework.Logf("Skipping final cluster status: private key not found at %s", hypervisorConfig.PrivateKeyPath)
+		return
+	}
+
+	knownHostsPath, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
+	if err != nil {
+		framework.Logf("Skipping final cluster status: failed to prepare known hosts: %v", err)
+		return
+	}
+
+	framework.Logf("========== FINAL CLUSTER STATUS ==========")
+
+	for _, node := range nodes {
+		nodeIP := utils.GetNodeInternalIP(&node)
+		if nodeIP == "" {
+			framework.Logf("Skipping node %s: no internal IP", node.Name)
+			continue
+		}
+
+		remoteKnownHostsPath, err := core.PrepareRemoteKnownHostsFile(nodeIP, &hypervisorConfig, knownHostsPath)
+		if err != nil {
+			framework.Logf("Failed to prepare remote known hosts for node %s: %v", node.Name, err)
+			continue
+		}
+
+		// pcs status
+		pcsOutput, pcsStderr, pcsErr := services.PcsStatus(nodeIP, &hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
+		if pcsErr != nil {
+			framework.Logf("Failed to get pcs status from node %s: %v\nstdout: %s\nstderr: %s", node.Name, pcsErr, pcsOutput, pcsStderr)
+		} else {
+			framework.Logf("pcs status from node %s:\n%s", node.Name, pcsOutput)
+		}
+
+		// etcd member list via SSH (-w table is the etcdctl v3 flag for table output)
+		etcdOutput, etcdStderr, etcdErr := core.ExecuteRemoteSSHCommand(nodeIP,
+			"sudo podman exec etcd etcdctl member list -w table",
+			&hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
+		if etcdErr != nil {
+			framework.Logf("Failed to get etcd member list from node %s: %v\nstdout: %s\nstderr: %s", node.Name, etcdErr, etcdOutput, etcdStderr)
+		} else {
+			framework.Logf("etcd member list from node %s:\n%s", node.Name, etcdOutput)
+		}
+
+		// Only need one successful node for cluster-wide status
+		if pcsErr == nil && etcdErr == nil {
+			break
+		}
+	}
+
+	framework.Logf("========== END FINAL CLUSTER STATUS ==========")
+}
+
 // deferDiagnosticsOnFailure registers a DeferCleanup handler that gathers diagnostic
 // information when the current test spec fails. This should be called early in test
 // setup to ensure diagnostics are collected on any failure.