Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions test/extended/two_node/tnf_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
peerNode = nodes.Items[randomIndex]
// Select the remaining index
targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]

// Log final pcs and etcd status after every test (pass or fail) via SSH
g.DeferCleanup(func() {
logFinalClusterStatus([]corev1.Node{peerNode, targetNode})
})
})

g.It("should recover from graceful node shutdown with etcd member re-addition", func() {
Expand Down Expand Up @@ -778,6 +783,74 @@ func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
}
}

// logFinalClusterStatus logs pcs status and the etcd member list via SSH after every
// test (pass or fail). It uses the hypervisor SSH path because the Kubernetes API may
// not be available after a recovery test. Every failure is logged and swallowed so
// this diagnostic step can never fail the test.
//
// nodes are tried in order; the loop stops at the first node for which both the pcs
// and the etcd queries succeed, because one successful node is enough for
// cluster-wide status. If no hypervisor configuration is present the function
// returns without logging anything.
func logFinalClusterStatus(nodes []corev1.Node) {
	if !exutil.HasHypervisorConfig() {
		return
	}

	sshConfig := exutil.GetHypervisorConfig()
	hypervisorConfig := core.SSHConfig{
		IP:             sshConfig.HypervisorIP,
		User:           sshConfig.SSHUser,
		PrivateKeyPath: sshConfig.PrivateKeyPath,
	}

	// Skip on ANY stat failure, not only "not found": an unreadable or otherwise
	// inaccessible key cannot be used for SSH either, and reporting the real cause
	// here is clearer than a later, unrelated-looking SSH setup error.
	if _, err := os.Stat(hypervisorConfig.PrivateKeyPath); err != nil {
		if os.IsNotExist(err) {
			framework.Logf("Skipping final cluster status: private key not found at %s", hypervisorConfig.PrivateKeyPath)
		} else {
			framework.Logf("Skipping final cluster status: cannot access private key at %s: %v", hypervisorConfig.PrivateKeyPath, err)
		}
		return
	}

	knownHostsPath, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
	if err != nil {
		framework.Logf("Skipping final cluster status: failed to prepare known hosts: %v", err)
		return
	}

	framework.Logf("========== FINAL CLUSTER STATUS ==========")

	// Iterate by index and take a pointer into the slice: this avoids copying the
	// whole corev1.Node on every iteration and avoids taking the address of the
	// range variable.
	for i := range nodes {
		node := &nodes[i]

		nodeIP := utils.GetNodeInternalIP(node)
		if nodeIP == "" {
			framework.Logf("Skipping node %s: no internal IP", node.Name)
			continue
		}

		remoteKnownHostsPath, err := core.PrepareRemoteKnownHostsFile(nodeIP, &hypervisorConfig, knownHostsPath)
		if err != nil {
			framework.Logf("Failed to prepare remote known hosts for node %s: %v", node.Name, err)
			continue
		}

		// pcs status
		pcsOutput, pcsStderr, pcsErr := services.PcsStatus(nodeIP, &hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
		if pcsErr != nil {
			framework.Logf("Failed to get pcs status from node %s: %v\nstdout: %s\nstderr: %s", node.Name, pcsErr, pcsOutput, pcsStderr)
		} else {
			framework.Logf("pcs status from node %s:\n%s", node.Name, pcsOutput)
		}

		// etcd member list via SSH (-w table is the etcdctl v3 flag for table output).
		// This is attempted even when pcs failed so that whatever can be collected
		// from this node still ends up in the log.
		etcdOutput, etcdStderr, etcdErr := core.ExecuteRemoteSSHCommand(nodeIP,
			"sudo podman exec etcd etcdctl member list -w table",
			&hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
		if etcdErr != nil {
			framework.Logf("Failed to get etcd member list from node %s: %v\nstdout: %s\nstderr: %s", node.Name, etcdErr, etcdOutput, etcdStderr)
		} else {
			framework.Logf("etcd member list from node %s:\n%s", node.Name, etcdOutput)
		}

		// Only need one successful node for cluster-wide status; otherwise fall
		// through to the next node as a fallback.
		if pcsErr == nil && etcdErr == nil {
			break
		}
	}

	framework.Logf("========== END FINAL CLUSTER STATUS ==========")
}

// deferDiagnosticsOnFailure registers a DeferCleanup handler that gathers diagnostic
// information when the current test spec fails. This should be called early in test
// setup to ensure diagnostics are collected on any failure.
Expand Down