openshift · openshift-merge-robot · Mar 7, 2023 · Mar 6, 2023
diff --git a/docs/config/busybox_running_check.sh b/docs/config/busybox_running_check.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+set -e  # Abort on the first failing command; the EXIT trap set below then logs the status
+# Script name used in messages; "$0" is quoted so that a path with spaces is not split
+SCRIPT_NAME=$(basename "$0")
+PODS_NS_LIST=(busybox)  # User workload namespaces to check
+PODS_CT_LIST=(3      )  # Expected pod count for the namespace at the same index above
+
+# Source the MicroShift health check functions library
+source /usr/share/microshift/functions/greenboot.sh  # expected to provide get_wait_timeout, wait_for and the namespace_* checks called below
+
+# Set the exit handler to log the exit status
+trap 'script_exit' EXIT  # the trap command is evaluated when the script exits, so script_exit may be defined further down
+
+# Exit handler: print FAILURE if the exit status seen on entry is non-zero,
+# or FINISHED otherwise
+#
+# args: None
+# return: None
+function script_exit() {
+    if [ "$?" -eq 0 ] ; then status=FINISHED ; else status=FAILURE ; fi
+    echo $status
+}
+
+#
+# Main
+#
+
+# Refuse to run the checks for any user other than 'root'
+if [[ "$(id -u)" -ne 0 ]] ; then
+    echo "The '${SCRIPT_NAME}' script must be run with the 'root' user privileges"
+    exit 1
+fi
+
+echo "STARTED"
+
+# Exit if the MicroShift service is not enabled; quoted so an empty result cannot break '['
+if [ "$(systemctl is-enabled microshift.service 2>/dev/null)" != "enabled" ] ; then
+    echo "MicroShift service is not enabled. Exiting..."
+    exit 0
+fi
+
+# Timeout applied to each wait below, as computed by the library function
+# from the boot counter
+WAIT_TIMEOUT_SECS="$(get_wait_timeout)"
+
+# Phase 1: wait until the images used by the pods of each namespace are downloaded
+for CHECK_PODS_NS in "${PODS_NS_LIST[@]}" ; do
+    # The check function takes the namespace from the CHECK_PODS_NS variable
+    echo "Waiting ${WAIT_TIMEOUT_SECS}s for pod image(s) from the '${CHECK_PODS_NS}' namespace to be downloaded"
+    wait_for "${WAIT_TIMEOUT_SECS}" namespace_images_downloaded
+done
+
+# Phase 2: wait until the expected number of pods of each namespace is ready
+for ns_idx in "${!PODS_NS_LIST[@]}" ; do
+    # Namespaces and expected pod counts are paired by array index
+    CHECK_PODS_NS="${PODS_NS_LIST[ns_idx]}"
+    CHECK_PODS_CT="${PODS_CT_LIST[ns_idx]}"
+
+    echo "Waiting ${WAIT_TIMEOUT_SECS}s for ${CHECK_PODS_CT} pod(s) from the '${CHECK_PODS_NS}' namespace to be in 'Ready' state"
+    wait_for "${WAIT_TIMEOUT_SECS}" namespace_pods_ready
+done
+
+# Phase 3: fail the script if the pods of any namespace are restarting
+for CHECK_PODS_NS in "${PODS_NS_LIST[@]}" ; do
+    # Called directly, without wait_for, with the namespace as an argument
+    echo "Checking pod restart count in the '${CHECK_PODS_NS}' namespace"
+    namespace_pods_not_restarting "${CHECK_PODS_NS}"
+done
diff --git a/docs/greenboot_dev.md b/docs/greenboot_dev.md
@@ -3,10 +3,68 @@
 ## Motivation
 
 [Integrating MicroShift with Greenboot](./greenboot.md) allows for automatic
-software upgrade rollbacks in case of a failure. The current document describes
-a few techniques for simulating software upgrade failures in a development
-environment. These guidelines can be used by developers for implementing CI/CD
-pipelines testing MicroShift integration with Greenboot.
+software upgrade rollbacks in case of a failure.
+
+The current document describes a few techniques for:
+* Adding user workload health check procedures in a production environment
+* Simulating software upgrade failures in a development environment
+
+These guidelines can be used by developers for implementing user workload
+health checks using Greenboot facilities, as well as for simulating failures
+to test MicroShift integration with Greenboot in CI/CD pipelines.
+
+## User Workload Health
+
+### Installation
+
+Follow the instructions in the [Auto-applying Manifests](./howto_config.md#auto-applying-manifests)
+section to install a dummy user workload, without restarting the MicroShift service
+at this time.
+
+Proceed by creating a health check script in the `/etc/greenboot/check/required.d`
+directory.
+> The name prefix of the user script should be chosen to make sure it runs after
+> the `40_microshift_running_check.sh` script, which implements the MicroShift
+> health check procedure for its core services.
+
+```bash
+SCRIPT_FILE=/etc/greenboot/check/required.d/50_busybox_running_check.sh
+sudo curl -sf https://raw.githubusercontent.com/openshift/microshift/main/docs/config/busybox_running_check.sh \
+  -o "${SCRIPT_FILE}" && echo SUCCESS || echo ERROR
+sudo chmod 755 "${SCRIPT_FILE}"
+```
+
+### Testing
+
+Reboot the system and run the following command to examine the output of the
+Greenboot health checks. Note that the MicroShift core service health checks
+run before the user workload health checks.
+
+```bash
+sudo journalctl -o cat -u greenboot-healthcheck.service
+```
+
+### Health Check Implementation
+
+The script utilizes the MicroShift health check functions that are available
+in the `/usr/share/microshift/functions/greenboot.sh` file to reuse procedures
+already implemented for the MicroShift core services. These functions need a
+definition of the user workload namespaces and the expected pod count for each namespace.
+
+```bash
+PODS_NS_LIST=(busybox)
+PODS_CT_LIST=(3      )
+```
+
+The script starts by running sanity checks to verify that it is executed from
+the `root` account and that the MicroShift service is enabled.
+
+Finally, the MicroShift health check functions are called to perform the
+following actions:
+- Call the `get_wait_timeout` function to get the wait timeout of the current boot cycle for the `wait_for` function
+- Call the `namespace_images_downloaded` function to wait until pod images are available
+- Call the `namespace_pods_ready` function to wait until pods are ready
+- Call the `namespace_pods_not_restarting` function to verify pods are not restarting
 
 ## MicroShift Service Failure
 
@@ -69,7 +127,7 @@ sudo rpm-ostree cleanup -b -r
 
 ## MicroShift Pod Failure
 
-To simulate a situation with the MicroShift pod failure after an upgrade, 
+To simulate a situation with the MicroShift pod failure after an upgrade,
 one can set the `network.serviceNetwork` MicroShift configuration option to a
 non-default `10.66.0.0/16` value without resetting the MicroShift data at the
 `/var/lib/microshift` directory.

diff --git a/packaging/greenboot/functions.sh b/packaging/greenboot/functions.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+#
+# Functions used by MicroShift in Greenboot health check procedures.
+# This library may also be used for user workload health check verification.
+#
+SCRIPT_PID=$$  # PID of the shell running the script that sourced this library; namespace_pods_ready sends SIGTERM to it
+
+OCCONFIG_OPT="--kubeconfig /var/lib/microshift/resources/kubeadmin/kubeconfig"  # kubeconfig option included in OCGET_CMD
+OCGET_OPT="--no-headers"  # extra option the check functions pass to their 'oc get pods' queries
+OCGET_CMD="oc get ${OCCONFIG_OPT}"  # expanded unquoted by the check functions, so it splits into the command and its options
+
+# Compute the wait timeout recommended for health check operations: the base
+# timeout multiplied by the difference between the maximum number of boots and
+# the GRUB 'boot_counter' value. The plain base timeout is used when the boot
+# counter is not available or when the product is not positive.
+#
+# The base timeout (MICROSHIFT_WAIT_TIMEOUT_SEC, default 300) and the maximum
+# boots (GREENBOOT_MAX_BOOTS, default 3) may be set in the optional
+# /etc/greenboot/greenboot.conf file, which is sourced on every call.
+# args: None
+# return: Print the recommended timeout value to stdout
+function get_wait_timeout() {
+    local greenboot_conf=/etc/greenboot/greenboot.conf
+
+    # Load the optional Greenboot settings before applying the defaults
+    if [ -f "${greenboot_conf}" ] ; then source ${greenboot_conf} ; fi
+    local timeout_base=${MICROSHIFT_WAIT_TIMEOUT_SEC:-300}
+    local boots_max=${GREENBOOT_MAX_BOOTS:-3}
+
+    # A missing boot counter yields a multiplier of one (see the fallback below)
+    local boots_left=$(grub2-editenv - list | awk -F= '/^boot_counter=/ {print $2}')
+    if [ -z "${boots_left}" ] ; then boots_left=$(( ${boots_max} - 1 )) ; fi
+
+    # Multiply the base by the boots difference, falling back to the base itself
+    local result=$(( ${timeout_base} * ( ${boots_max} - ${boots_left} ) ))
+    if [ ${result} -le 0 ] ; then result=${timeout_base} ; fi
+    echo $result
+}
+
+# Run a command in a subshell, retrying after a one-second delay, until it
+# returns a zero exit status or the wait period is over
+# arg1: Time in seconds to wait for a command to succeed
+# argN: Command to run with optional arguments
+# return: 0 if a command ran successfully within the wait period, or 1 otherwise
+function wait_for() {
+    local timeout=$1
+    shift 1
+    local start=$(date +%s)
+
+    while true ; do
+        # In a subshell, variable changes or an 'exit' in the command stay out of this shell
+        if ("$@") ; then return 0 ; fi
+
+        # The deadline is evaluated only after a failed attempt and the delay
+        sleep 1
+        if [ $(( $(date +%s) - start )) -ge $timeout ] ; then return 1 ; fi
+    done
+}
+
+# Check if all the pod images in a given namespace are downloaded. An empty image
+# list (no pods found, or a failed query: its errors are discarded) also passes.
+# args: None
+# env1: 'CHECK_PODS_NS' environment variable for the namespace to check
+# return: 0 if all the images in a given namespace are downloaded, or 1 otherwise
+function namespace_images_downloaded() {
+    local ns=${CHECK_PODS_NS}
+    local image  # keeps the loop variable from overwriting a caller's variable
+
+    local images=$(${OCGET_CMD} pods ${OCGET_OPT} -n "${ns}" -o jsonpath="{.items[*].spec.containers[*].image}" 2>/dev/null)
+    for image in ${images} ; do
+        # Return an error on the first missing image
+        local cimage=$(crictl image -q "${image}")
+        [ -z "${cimage}" ] && return 1
+    done
+    return 0
+}
+
+# Check that exactly the expected number of pods in a namespace report a 'Ready'
+# condition of True and none report False. If more pods are ready than expected,
+# SIGTERM is sent to the script that sourced this library: nothing to wait for.
+# args: None
+# env1: 'CHECK_PODS_NS' environment variable for the namespace to check
+# env2: 'CHECK_PODS_CT' environment variable for the pod count to check
+# return: 0 if the expected number of pods are ready, or 1 otherwise
+function namespace_pods_ready() {
+    local ns=${CHECK_PODS_NS}
+    local expected=${CHECK_PODS_CT}
+
+    # Collect the status values of the 'Ready' conditions and count each kind
+    local conds=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
+    local ready=$(grep -o True  <<<"${conds}" | wc -l)
+    local unready=$(grep -o False <<<"${conds}" | wc -l)
+
+    # Too many ready pods can never match the expected count: stop the whole script
+    if [ "${ready}" -gt "${expected}" ] ; then
+        echo "The number of ready pods in the '${ns}' namespace is greater than the expected '${expected}' count. Terminating..."
+        kill -TERM ${SCRIPT_PID}
+    fi
+
+    # Succeed only when no pod is unready and the ready count matches exactly
+    if [ "${unready}" -eq 0 ] && [ "${ready}" -eq "${expected}" ] ; then return 0 ; fi
+    return 1
+}
+
+# Verify that the pods in a given namespace have started and are not restarting by sampling the
+# reported restart counts 10 times, 5 seconds apart, and comparing each sample with the previous one.
+# A sample counts as 'restarting' if the counts changed, or if the 'started' query output is empty
+# or contains 'false'. The check passes only if the 'not-restarting' samples outnumber the
+# 'restarting' ones, so a tie is treated as restarting.
+# arg1: Name of the namespace to check
+# return: 0 if pods are not restarting, or 1 otherwise
+function namespace_pods_not_restarting() {
+    local ns=$1
+    local restarts=0
+    local sample  # local loop counter, so the loop variable of a caller is never overwritten
+    local count1=$(${OCGET_CMD} pods ${OCGET_OPT} -n "${ns}" -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null)
+    for sample in {1..10} ; do
+        sleep 5
+        local countS=$(${OCGET_CMD} pods ${OCGET_OPT} -n "${ns}" -o 'jsonpath={..status.containerStatuses[].started}' 2>/dev/null | grep -vc false)
+        local count2=$(${OCGET_CMD} pods ${OCGET_OPT} -n "${ns}" -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null)
+
+        # If pods started, a restart is detected by comparing the count string between the checks.
+        # The number of pod restarts is incremented when a restart is detected, or decremented otherwise.
+        if [ "${countS}" -ne 0 ] && [ "${count1}" = "${count2}" ] ; then
+            restarts=$(( restarts - 1 ))
+        else
+            restarts=$(( restarts + 1 ))
+            count1=${count2}
+        fi
+    done
+
+    [ "${restarts}" -lt 0 ] && return 0
+    return 1
+}
diff --git a/packaging/greenboot/microshift-pre-rollback.sh b/packaging/greenboot/microshift-pre-rollback.sh