From 66ede7b71e8ba8243ecb92bd794672f6ebb135d3 Mon Sep 17 00:00:00 2001 From: Gregory Giguashvili Date: Mon, 6 Mar 2023 18:32:58 +0000 Subject: [PATCH] Add user workload health check docs to Greenboot procedures --- docs/config/busybox_running_check.sh | 68 +++++++++ docs/greenboot_dev.md | 68 ++++++++- packaging/greenboot/functions.sh | 134 ++++++++++++++++++ .../greenboot/microshift-pre-rollback.sh | 0 .../greenboot/microshift-running-check.sh | 120 +--------------- packaging/rpm/microshift.spec | 3 + 6 files changed, 275 insertions(+), 118 deletions(-) create mode 100644 docs/config/busybox_running_check.sh create mode 100644 packaging/greenboot/functions.sh mode change 100644 => 100755 packaging/greenboot/microshift-pre-rollback.sh diff --git a/docs/config/busybox_running_check.sh b/docs/config/busybox_running_check.sh new file mode 100644 index 0000000000..87b804fb56 --- /dev/null +++ b/docs/config/busybox_running_check.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -e + +SCRIPT_NAME=$(basename $0) +PODS_NS_LIST=(busybox) +PODS_CT_LIST=(3 ) + +# Source the MicroShift health check functions library +source /usr/share/microshift/functions/greenboot.sh + +# Set the exit handler to log the exit status +trap 'script_exit' EXIT + +# The script exit handler logging the FAILURE or FINISHED message depending +# on the exit status of the last command +# +# args: None +# return: None +function script_exit() { + [ "$?" -ne 0 ] && status=FAILURE || status=FINISHED + echo $status +} + +# +# Main +# + +# Exit if the current user is not 'root' +if [ $(id -u) -ne 0 ] ; then + echo "The '${SCRIPT_NAME}' script must be run with the 'root' user privileges" + exit 1 +fi + +echo "STARTED" + +# Exit if the MicroShift service is not enabled +if [ $(systemctl is-enabled microshift.service 2>/dev/null) != "enabled" ] ; then + echo "MicroShift service is not enabled. Exiting..." 
+ exit 0 +fi + +# Set the wait timeout for the current check based on the boot counter +WAIT_TIMEOUT_SECS=$(get_wait_timeout) + +# Wait for pod images to be downloaded +for i in ${!PODS_NS_LIST[@]}; do + CHECK_PODS_NS=${PODS_NS_LIST[$i]} + + echo "Waiting ${WAIT_TIMEOUT_SECS}s for pod image(s) from the '${CHECK_PODS_NS}' namespace to be downloaded" + wait_for ${WAIT_TIMEOUT_SECS} namespace_images_downloaded +done + +# Wait for pods to enter ready state +for i in ${!PODS_NS_LIST[@]}; do + CHECK_PODS_NS=${PODS_NS_LIST[$i]} + CHECK_PODS_CT=${PODS_CT_LIST[$i]} + + echo "Waiting ${WAIT_TIMEOUT_SECS}s for ${CHECK_PODS_CT} pod(s) from the '${CHECK_PODS_NS}' namespace to be in 'Ready' state" + wait_for ${WAIT_TIMEOUT_SECS} namespace_pods_ready +done + +# Verify that pods are not restarting +for i in ${!PODS_NS_LIST[@]}; do + CHECK_PODS_NS=${PODS_NS_LIST[$i]} + + echo "Checking pod restart count in the '${CHECK_PODS_NS}' namespace" + namespace_pods_not_restarting ${CHECK_PODS_NS} +done diff --git a/docs/greenboot_dev.md b/docs/greenboot_dev.md index 3eda4dc1ad..7dd01b681a 100644 --- a/docs/greenboot_dev.md +++ b/docs/greenboot_dev.md @@ -3,10 +3,68 @@ ## Motivation [Integrating MicroShift with Greenboot](./greenboot.md) allows for automatic -software upgrade rollbacks in case of a failure. The current document describes -a few techniques for simulating software upgrade failures in a development -environment. These guidelines can be used by developers for implementing CI/CD -pipelines testing MicroShift integration with Greenboot. +software upgrade rollbacks in case of a failure. 
+ +The current document describes a few techniques for: +* Adding user workload health check procedures in a production environment +* Simulating software upgrade failures in a development environment + +These guidelines can be used by developers for implementing user workload +health check using Greenboot facilities, as well as simulating failures for +testing MicroShift integration with Greenboot in CI/CD pipelines. + +## User Workload Health + +### Installation + +Follow the instructions in [Auto-applying Manifests](./howto_config.md#auto-applying-manifests) +section to install a dummy user workload, without restarting the MicroShift service +at this time. + +Proceed by creating a health check script in the `/etc/greenboot/check/required.d` +directory. +> The name prefix of the user script should be chosen to make sure it runs after +> the `40_microshift_running_check.sh` script, which implements the MicroShift +> health check procedure for its core services. + +``` +SCRIPT_FILE=/etc/greenboot/check/required.d/50_busybox_running_check.sh +sudo curl -s https://raw.githubusercontent.com/openshift/microshift/main/docs/config/busybox_running_check.sh \ + -o ${SCRIPT_FILE} && echo SUCCESS || echo ERROR +sudo chmod 755 ${SCRIPT_FILE} +``` + +### Testing + +Reboot the system and run the following command to examine the output of the +Greenboot health checks. Note that the MicroShift core service health checks +are running before the user workload health checks. + +```bash +sudo journalctl -o cat -u greenboot-healthcheck.service +``` + +### Health Check Implementation + +The script utilizes the MicroShift health check functions that are available +in the `/usr/share/microshift/functions/greenboot.sh` file to reuse procedures +already implemented for the MicroShift core services. These functions need a +definition of the user workload namespaces and the expected count of pods. 
+ +```bash +PODS_NS_LIST=(busybox) +PODS_CT_LIST=(3 ) +``` + +The script starts by running sanity checks to verify that it is executed from +the `root` account and that the MicroShift service is enabled. + +Finally, the MicroShift health check functions are called to perform the +following actions: +- Get a wait timeout of the current boot cycle for the `wait_for` function +- Call the `namespace_images_downloaded` function to wait until pod images are available +- Call the `namespace_pods_ready` function to wait until pods are ready +- Call the `namespace_pods_not_restarting` function to verify pods are not restarting ## MicroShift Service Failure @@ -69,7 +127,7 @@ sudo rpm-ostree cleanup -b -r ## MicroShift Pod Failure -To simulate a situation with the MicroShift pod failure after an upgrade, +To simulate a situation with the MicroShift pod failure after an upgrade, one can set the `network.serviceNetwork` MicroShift configuration option to a non-default `10.66.0.0/16` value without resetting the MicroShift data at the `/var/lib/microshift` directory. diff --git a/packaging/greenboot/functions.sh b/packaging/greenboot/functions.sh new file mode 100644 index 0000000000..5fde8d1650 --- /dev/null +++ b/packaging/greenboot/functions.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# +# Functions used by MicroShift in Greenboot health check procedures. +# This library may also be used for user workload health check verification. +# +SCRIPT_PID=$$ + +OCCONFIG_OPT="--kubeconfig /var/lib/microshift/resources/kubeadmin/kubeconfig" +OCGET_OPT="--no-headers" +OCGET_CMD="oc get ${OCCONFIG_OPT}" + +# Get the recommended wait timeout to be used for running health check operations. +# The returned timeout is a product of a base value and a boot attempt counter, so +# that the timeout increases after every boot attempt. 
+#
+# The base value for the timeout and the maximum boot attempts can be defined in
+# the /etc/greenboot/greenboot.conf file using the MICROSHIFT_WAIT_TIMEOUT_SEC
+# and GREENBOOT_MAX_BOOT_ATTEMPTS settings.
+#
+# args: None
+# return: Print the recommended timeout value to stdout
+function get_wait_timeout() {
+    # Source Greenboot configuration file if it exists
+    local conf_file=/etc/greenboot/greenboot.conf
+    [ -f "${conf_file}" ] && source ${conf_file}
+    local base_timeout=${MICROSHIFT_WAIT_TIMEOUT_SEC:-300}
+
+    # Update the wait timeout according to the boot counter.
+    # The new wait timeout is a product of the timeout base and the number of boot attempts.
+    # NOTE(review): read GREENBOOT_MAX_BOOT_ATTEMPTS (greenboot's documented setting,
+    # also what the pre-refactor script read) rather than the undefined GREENBOOT_MAX_BOOTS.
+    local max_boots=${GREENBOOT_MAX_BOOT_ATTEMPTS:-3}
+    local boot_counter=$(grub2-editenv - list | grep ^boot_counter= | awk -F= '{print $2}')
+    [ -z "${boot_counter}" ] && boot_counter=$(( $max_boots - 1 ))
+
+    local wait_timeout=$(( $base_timeout * ( $max_boots - $boot_counter ) ))
+    [ ${wait_timeout} -le 0 ] && wait_timeout=${base_timeout}
+
+    echo $wait_timeout
+}
+
+# Run a command with a second delay until it returns a zero exit status
+#
+# arg1: Time in seconds to wait for a command to succeed
+# argN: Command to run with optional arguments
+# return: 0 if a command ran successfully within the wait period, or 1 otherwise
+function wait_for() {
+    local timeout=$1
+    shift 1
+
+    local start=$(date +%s)
+    until ("$@"); do
+        sleep 1
+
+        local now=$(date +%s)
+        [ $(( now - start )) -ge $timeout ] && return 1
+    done
+
+    return 0
+}
+
+# Check if all the pod images in a given namespace are downloaded. 
+# +# args: None +# env1: 'CHECK_PODS_NS' environment variable for the namespace to check +# return: 0 if all the images in a given namespace are downloaded, or 1 otherwise +function namespace_images_downloaded() { + local ns=${CHECK_PODS_NS} + + local images=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o jsonpath="{.items[*].spec.containers[*].image}" 2>/dev/null) + for i in ${images} ; do + # Return an error on the first missing image + local cimage=$(crictl image -q ${i}) + [ -z "${cimage}" ] && return 1 + done + + return 0 +} + +# Check if a given number of pods in a given namespace are in the 'Ready' status, +# terminating the script with the SIGTERM signal if more pods are ready than expected. +# +# args: None +# env1: 'CHECK_PODS_NS' environment variable for the namespace to check +# env2: 'CHECK_PODS_CT' environment variable for the pod count to check +# return: 0 if the expected number of pods are ready, or 1 otherwise +function namespace_pods_ready() { + local ns=${CHECK_PODS_NS} + local ct=${CHECK_PODS_CT} + + local status=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) + local tcount=$(echo $status | grep -o True | wc -l) + local fcount=$(echo $status | grep -o False | wc -l) + + # Terminate the script in case more pods are ready than expected - nothing to wait for + if [ "${tcount}" -gt "${ct}" ] ; then + echo "The number of ready pods in the '${ns}' namespace is greater than the expected '${ct}' count. Terminating..." + kill -TERM ${SCRIPT_PID} + fi + # Exit with error if any pods are not ready yet + [ "${fcount}" -gt 0 ] && return 1 + # Check the ready pod count + [ "${tcount}" -eq "${ct}" ] && return 0 + return 1 +} + +# Check if MicroShift pods in a given namespace started and verify they are not restarting by sampling +# the pod restart count 10 times every 5 seconds and comparing the current sample with the previous one. 
+# The pods are considered restarting if the number of 'pod-restarting' samples is greater than the +# number of 'pod-not-restarting' ones. +# +# arg1: Name of the namespace to check +# return: 0 if pods are not restarting, or 1 otherwise +function namespace_pods_not_restarting() { + local ns=$1 + local restarts=0 + + local count1=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) + for i in $(seq 10) ; do + sleep 5 + local countS=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].started}' 2>/dev/null | grep -vc false) + local count2=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) + + # If pods started, a restart is detected by comparing the count string between the checks. + # The number of pod restarts is incremented when a restart is detected, or decremented otherwise. + if [ "${countS}" -ne 0 ] && [ "${count1}" = "${count2}" ] ; then + restarts=$(( restarts - 1 )) + else + restarts=$(( restarts + 1 )) + count1=${count2} + fi + done + + [ "${restarts}" -lt 0 ] && return 0 + return 1 +} diff --git a/packaging/greenboot/microshift-pre-rollback.sh b/packaging/greenboot/microshift-pre-rollback.sh old mode 100644 new mode 100755 diff --git a/packaging/greenboot/microshift-running-check.sh b/packaging/greenboot/microshift-running-check.sh index a0eba76943..1aa1978418 100755 --- a/packaging/greenboot/microshift-running-check.sh +++ b/packaging/greenboot/microshift-running-check.sh @@ -3,15 +3,11 @@ set -e SCRIPT_NAME=$(basename $0) SCRIPT_PID=$$ -OCGET_CMD="oc get --kubeconfig /var/lib/microshift/resources/kubeadmin/kubeconfig" -OCGET_OPT="--no-headers" PODS_NS_LIST=(openshift-ovn-kubernetes openshift-service-ca openshift-ingress openshift-dns openshift-storage) PODS_CT_LIST=(2 1 1 2 2) -# Source Greenboot configuration file if it exists -GREENBOOT_CONF_FILE=/etc/greenboot/greenboot.conf -[ -f 
"${GREENBOOT_CONF_FILE}" ] && source ${GREENBOOT_CONF_FILE} -WAIT_TIMEOUT_SECS_BASE=${MICROSHIFT_WAIT_TIMEOUT_SEC:-300} +# Source the MicroShift health check functions library +source /usr/share/microshift/functions/greenboot.sh # Set the exit handler to log the exit status trap 'script_exit' EXIT @@ -26,26 +22,6 @@ function script_exit() { echo $status } -# Run a command with a second delay until it returns a zero exit status -# -# arg1: Time in seconds to wait for a command to succeed -# argN: Command to run with optional arguments -# return: 0 if a command ran successfully within the wait period, or 1 otherwise -function wait_for() { - local timeout=$1 - shift 1 - - local start=$(date +%s) - until ("$@"); do - sleep 1 - - local now=$(date +%s) - [ $(( now - start )) -ge $timeout ] && return 1 - done - - return 0 -} - # Check the microshift.service systemd unit activity, terminating the script # with the SIGTERM signal if the unit reports a failed state # @@ -66,7 +42,7 @@ function microshift_service_active() { } # Check if MicroShift API 'readyz' and 'livez' health endpoints are OK -# +# # args: None # return: 0 if all API health endpoints are OK, or 1 otherwise function microshift_health_endpoints_ok() { @@ -79,7 +55,7 @@ function microshift_health_endpoints_ok() { } # Check if any MicroShift pods are in the 'Running' status -# +# # args: None # return: 0 if any pods are in the 'Running' status, or 1 otherwise function any_pods_running() { @@ -89,82 +65,6 @@ function any_pods_running() { return 1 } -# Check if all the MicroShift pod images in a given namespace are downloaded. 
-# -# args: None -# env1: 'CHECK_PODS_NS' environment variable for the namespace to check -# return: 0 if all the images in a given namespace are downloaded, or 1 otherwise -function namespace_images_downloaded() { - local ns=${CHECK_PODS_NS} - - local images=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o jsonpath="{.items[*].spec.containers[*].image}" 2>/dev/null) - for i in ${images} ; do - # Return an error on the first missing image - local cimage=$(crictl image -q ${i}) - [ -z "${cimage}" ] && return 1 - done - - return 0 -} - -# Check if a given number of MicroShift pods in a given namespace are in the 'Ready' status, -# terminating the script with the SIGTERM signal if more pods are ready than expected. -# -# args: None -# env1: 'CHECK_PODS_NS' environment variable for the namespace to check -# env2: 'CHECK_PODS_CT' environment variable for the pod count to check -# return: 0 if the expected number of pods are ready, or 1 otherwise -function namespace_pods_ready() { - local ns=${CHECK_PODS_NS} - local ct=${CHECK_PODS_CT} - - local status=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) - local tcount=$(echo $status | grep -o True | wc -l) - local fcount=$(echo $status | grep -o False | wc -l) - - # Terminate the script in case more pods are ready than expected - nothing to wait for - if [ "${tcount}" -gt "${ct}" ] ; then - echo "The number of ready pods in the '${ns}' namespace is greater than the expected '${ct}' count. Terminating..." - kill -TERM ${SCRIPT_PID} - fi - # Exit with error if any pods are not ready yet - [ "${fcount}" -gt 0 ] && return 1 - # Check the ready pod count - [ "${tcount}" -eq "${ct}" ] && return 0 - return 1 -} - -# Check if MicroShift pods in a given namespace started and verify they are not restarting by sampling -# the pod restart count 10 times every 5 seconds and comparing the current sample with the previous one. 
-# The pods are considered restarting if the number of 'pod-restarting' samples is greater than the -# number of 'pod-not-restarting' ones. -# -# arg1: Name of the namespace to check -# return: 0 if pods are not restarting, or 1 otherwise -function namespace_pods_not_restarting() { - local ns=$1 - local restarts=0 - - local count1=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) - for i in $(seq 10) ; do - sleep 5 - local countS=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].started}' 2>/dev/null | grep -vc false) - local count2=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) - - # If pods started, a restart is detected by comparing the count string between the checks. - # The number of pod restarts is incremented when a restart is detected, or decremented otherwise. - if [ "${countS}" -ne 0 ] && [ "${count1}" = "${count2}" ] ; then - restarts=$(( restarts - 1 )) - else - restarts=$(( restarts + 1 )) - count1=${count2} - fi - done - - [ "${restarts}" -lt 0 ] && return 0 - return 1 -} - # # Main # @@ -195,14 +95,8 @@ if [ $(systemctl is-enabled microshift.service 2>/dev/null) != "enabled" ] ; the exit 0 fi -# Update the wait timeout according to the boot counter. -# The new wait timeout is a product of the timeout base and the number of boot attempts. 
-MAX_BOOT_ATTEMPTS=${GREENBOOT_MAX_BOOT_ATTEMPTS:-3} -BOOT_COUNTER=$(grub2-editenv - list | grep ^boot_counter= | awk -F= '{print $2}') -[ -z "${BOOT_COUNTER}" ] && BOOT_COUNTER=$(( $MAX_BOOT_ATTEMPTS - 1 )) - -WAIT_TIMEOUT_SECS=$(( $WAIT_TIMEOUT_SECS_BASE * ( $MAX_BOOT_ATTEMPTS - $BOOT_COUNTER ) )) -[ ${WAIT_TIMEOUT_SECS} -le 0 ] && WAIT_TIMEOUT_SECS=${WAIT_TIMEOUT_SECS_BASE} +# Set the wait timeout for the current check based on the boot counter +WAIT_TIMEOUT_SECS=$(get_wait_timeout) # Wait for MicroShift service to be active (failed status terminates the script) echo "Waiting ${WAIT_TIMEOUT_SECS}s for MicroShift service to be active and not failed" @@ -218,7 +112,7 @@ wait_for ${WAIT_TIMEOUT_SECS} any_pods_running # Wait for MicroShift core pod images to be downloaded for i in ${!PODS_NS_LIST[@]}; do - CHECK_PODS_NS=${PODS_NS_LIST[$i]} + CHECK_PODS_NS=${PODS_NS_LIST[$i]} echo "Waiting ${WAIT_TIMEOUT_SECS}s for pod image(s) from the '${CHECK_PODS_NS}' namespace to be downloaded" wait_for ${WAIT_TIMEOUT_SECS} namespace_images_downloaded diff --git a/packaging/rpm/microshift.spec b/packaging/rpm/microshift.spec index ffd2f32a52..1664bd10ad 100644 --- a/packaging/rpm/microshift.spec +++ b/packaging/rpm/microshift.spec @@ -208,6 +208,8 @@ install -d -m755 %{buildroot}%{_sysconfdir}/greenboot/check/required.d install -d -m755 %{buildroot}%{_sysconfdir}/greenboot/red.d install -p -m755 packaging/greenboot/microshift-running-check.sh %{buildroot}%{_sysconfdir}/greenboot/check/required.d/40_microshift_running_check.sh install -p -m755 packaging/greenboot/microshift-pre-rollback.sh %{buildroot}%{_sysconfdir}/greenboot/red.d/40_microshift_pre_rollback.sh +install -d -m755 %{buildroot}%{_datadir}/microshift/functions +install -p -m644 packaging/greenboot/functions.sh %{buildroot}%{_datadir}/microshift/functions/greenboot.sh %post @@ -289,6 +291,7 @@ systemctl enable --now --quiet openvswitch || true %files greenboot 
%{_sysconfdir}/greenboot/check/required.d/40_microshift_running_check.sh %{_sysconfdir}/greenboot/red.d/40_microshift_pre_rollback.sh +%{_datadir}/microshift/functions/greenboot.sh # Use Git command to generate the log and replace the VERSION string # LANG=C git log --date="format:%a %b %d %Y" --pretty="tformat:* %cd %an <%ae> VERSION%n- %s%n" packaging/rpm/microshift.spec