From 66ede7b71e8ba8243ecb92bd794672f6ebb135d3 Mon Sep 17 00:00:00 2001 From: Gregory Giguashvili Date: Mon, 6 Mar 2023 18:32:58 +0000 Subject: [PATCH] Add user workload health check docs to Greenboot procedures --- docs/config/busybox_running_check.sh | 68 +++++++++ docs/greenboot_dev.md | 68 ++++++++- packaging/greenboot/functions.sh | 134 ++++++++++++++++++ .../greenboot/microshift-pre-rollback.sh | 0 .../greenboot/microshift-running-check.sh | 120 +--------------- packaging/rpm/microshift.spec | 3 + 6 files changed, 275 insertions(+), 118 deletions(-) create mode 100644 docs/config/busybox_running_check.sh create mode 100644 packaging/greenboot/functions.sh mode change 100644 => 100755 packaging/greenboot/microshift-pre-rollback.sh diff --git a/docs/config/busybox_running_check.sh b/docs/config/busybox_running_check.sh new file mode 100644 index 0000000000..87b804fb56 --- /dev/null +++ b/docs/config/busybox_running_check.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -e + +SCRIPT_NAME=$(basename $0) +PODS_NS_LIST=(busybox) +PODS_CT_LIST=(3 ) + +# Source the MicroShift health check functions library +source /usr/share/microshift/functions/greenboot.sh + +# Set the exit handler to log the exit status +trap 'script_exit' EXIT + +# The script exit handler logging the FAILURE or FINISHED message depending +# on the exit status of the last command +# +# args: None +# return: None +function script_exit() { + [ "$?" -ne 0 ] && status=FAILURE || status=FINISHED + echo $status +} + +# +# Main +# + +# Exit if the current user is not 'root' +if [ $(id -u) -ne 0 ] ; then + echo "The '${SCRIPT_NAME}' script must be run with the 'root' user privileges" + exit 1 +fi + +echo "STARTED" + +# Exit if the MicroShift service is not enabled +if [ $(systemctl is-enabled microshift.service 2>/dev/null) != "enabled" ] ; then + echo "MicroShift service is not enabled. Exiting..." 
+ exit 0 +fi + +# Set the wait timeout for the current check based on the boot counter +WAIT_TIMEOUT_SECS=$(get_wait_timeout) + +# Wait for pod images to be downloaded +for i in ${!PODS_NS_LIST[@]}; do + CHECK_PODS_NS=${PODS_NS_LIST[$i]} + + echo "Waiting ${WAIT_TIMEOUT_SECS}s for pod image(s) from the '${CHECK_PODS_NS}' namespace to be downloaded" + wait_for ${WAIT_TIMEOUT_SECS} namespace_images_downloaded +done + +# Wait for pods to enter ready state +for i in ${!PODS_NS_LIST[@]}; do + CHECK_PODS_NS=${PODS_NS_LIST[$i]} + CHECK_PODS_CT=${PODS_CT_LIST[$i]} + + echo "Waiting ${WAIT_TIMEOUT_SECS}s for ${CHECK_PODS_CT} pod(s) from the '${CHECK_PODS_NS}' namespace to be in 'Ready' state" + wait_for ${WAIT_TIMEOUT_SECS} namespace_pods_ready +done + +# Verify that pods are not restarting +for i in ${!PODS_NS_LIST[@]}; do + CHECK_PODS_NS=${PODS_NS_LIST[$i]} + + echo "Checking pod restart count in the '${CHECK_PODS_NS}' namespace" + namespace_pods_not_restarting ${CHECK_PODS_NS} +done diff --git a/docs/greenboot_dev.md b/docs/greenboot_dev.md index 3eda4dc1ad..7dd01b681a 100644 --- a/docs/greenboot_dev.md +++ b/docs/greenboot_dev.md @@ -3,10 +3,68 @@ ## Motivation [Integrating MicroShift with Greenboot](./greenboot.md) allows for automatic -software upgrade rollbacks in case of a failure. The current document describes -a few techniques for simulating software upgrade failures in a development -environment. These guidelines can be used by developers for implementing CI/CD -pipelines testing MicroShift integration with Greenboot. +software upgrade rollbacks in case of a failure. 
+ +The current document describes a few techniques for: +* Adding user workload health check procedures in a production environment +* Simulating software upgrade failures in a development environment + +These guidelines can be used by developers for implementing user workload +health check using Greenboot facilities, as well as simulating failures for +testing MicroShift integration with Greenboot in CI/CD pipelines. + +## User Workload Health + +### Installation + +Follow the instructions in [Auto-applying Manifests](./howto_config.md#auto-applying-manifests) +section to install a dummy user workload, without restarting the MicroShift service +at this time. + +Proceed by creating a health check script in the `/etc/greenboot/check/required.d` +directory. +> The name prefix of the user script should be chosen to make sure it runs after +> the `40_microshift_running_check.sh` script, which implements the MicroShift +> health check procedure for its core services. + +``` +SCRIPT_FILE=/etc/greenboot/check/required.d/50_busybox_running_check.sh +sudo curl -s https://raw.githubusercontent.com/openshift/microshift/main/docs/config/busybox_running_check.sh \ + -o ${SCRIPT_FILE} && echo SUCCESS || echo ERROR +sudo chmod 755 ${SCRIPT_FILE} +``` + +### Testing + +Reboot the system and run the following command to examine the output of the +Greenboot health checks. Note that the MicroShift core service health checks +are running before the user workload health checks. + +```bash +sudo journalctl -o cat -u greenboot-healthcheck.service +``` + +### Health Check Implementation + +The script utilizes the MicroShift health check functions that are available +in the `/usr/share/microshift/functions/greenboot.sh` file to reuse procedures +already implemented for the MicroShift core services. These functions need a +definition of the user workload namespaces and the expected count of pods. 
+ +```bash +PODS_NS_LIST=(busybox) +PODS_CT_LIST=(3 ) +``` + +The script starts by running sanity checks to verify that it is executed from +the `root` account and that the MicroShift service is enabled. + +Finally, the MicroShift health check functions are called to perform the +following actions: +- Get a wait timeout of the current boot cycle for the `wait_for` function +- Call the `namespace_images_downloaded` function to wait until pod images are available +- Call the `namespace_pods_ready` function to wait until pods are ready +- Call the `namespace_pods_not_restarting` function to verify pods are not restarting ## MicroShift Service Failure @@ -69,7 +127,7 @@ sudo rpm-ostree cleanup -b -r ## MicroShift Pod Failure -To simulate a situation with the MicroShift pod failure after an upgrade, +To simulate a situation with the MicroShift pod failure after an upgrade, one can set the `network.serviceNetwork` MicroShift configuration option to a non-default `10.66.0.0/16` value without resetting the MicroShift data at the `/var/lib/microshift` directory. diff --git a/packaging/greenboot/functions.sh b/packaging/greenboot/functions.sh new file mode 100644 index 0000000000..5fde8d1650 --- /dev/null +++ b/packaging/greenboot/functions.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# +# Functions used by MicroShift in Greenboot health check procedures. +# This library may also be used for user workload health check verification. +# +SCRIPT_PID=$$ + +OCCONFIG_OPT="--kubeconfig /var/lib/microshift/resources/kubeadmin/kubeconfig" +OCGET_OPT="--no-headers" +OCGET_CMD="oc get ${OCCONFIG_OPT}" + +# Get the recommended wait timeout to be used for running health check operations. +# The returned timeout is a product of a base value and a boot attempt counter, so +# that the timeout increases after every boot attempt. 
+#
+# The base value for the timeout and the maximum boot attempts can be defined in
+# the /etc/greenboot/greenboot.conf file using the MICROSHIFT_WAIT_TIMEOUT_SEC
+# and GREENBOOT_MAX_BOOT_ATTEMPTS settings.
+#
+# args: None
+# return: Print the recommended timeout value to stdout
+function get_wait_timeout() {
+    # Source Greenboot configuration file if it exists
+    local conf_file=/etc/greenboot/greenboot.conf
+    [ -f "${conf_file}" ] && source ${conf_file}
+    local base_timeout=${MICROSHIFT_WAIT_TIMEOUT_SEC:-300}
+
+    # Update the wait timeout according to the boot counter.
+    # The new wait timeout is a product of the timeout base and the number of boot attempts.
+    # NOTE(review): read GREENBOOT_MAX_BOOT_ATTEMPTS (greenboot's documented setting,
+    # also what the pre-refactor script read) rather than the undefined GREENBOOT_MAX_BOOTS.
+    local max_boots=${GREENBOOT_MAX_BOOT_ATTEMPTS:-3}
+    local boot_counter=$(grub2-editenv - list | grep ^boot_counter= | awk -F= '{print $2}')
+    [ -z "${boot_counter}" ] && boot_counter=$(( $max_boots - 1 ))
+
+    local wait_timeout=$(( $base_timeout * ( $max_boots - $boot_counter ) ))
+    [ ${wait_timeout} -le 0 ] && wait_timeout=${base_timeout}
+
+    echo $wait_timeout
+}
+
+# Run a command with a second delay until it returns a zero exit status
+#
+# arg1: Time in seconds to wait for a command to succeed
+# argN: Command to run with optional arguments
+# return: 0 if a command ran successfully within the wait period, or 1 otherwise
+function wait_for() {
+    local timeout=$1
+    shift 1
+
+    local start=$(date +%s)
+    until ("$@"); do
+        sleep 1
+
+        local now=$(date +%s)
+        [ $(( now - start )) -ge $timeout ] && return 1
+    done
+
+    return 0
+}
+
+# Check if all the pod images in a given namespace are downloaded. 
+# +# args: None +# env1: 'CHECK_PODS_NS' environment variable for the namespace to check +# return: 0 if all the images in a given namespace are downloaded, or 1 otherwise +function namespace_images_downloaded() { + local ns=${CHECK_PODS_NS} + + local images=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o jsonpath="{.items[*].spec.containers[*].image}" 2>/dev/null) + for i in ${images} ; do + # Return an error on the first missing image + local cimage=$(crictl image -q ${i}) + [ -z "${cimage}" ] && return 1 + done + + return 0 +} + +# Check if a given number of pods in a given namespace are in the 'Ready' status, +# terminating the script with the SIGTERM signal if more pods are ready than expected. +# +# args: None +# env1: 'CHECK_PODS_NS' environment variable for the namespace to check +# env2: 'CHECK_PODS_CT' environment variable for the pod count to check +# return: 0 if the expected number of pods are ready, or 1 otherwise +function namespace_pods_ready() { + local ns=${CHECK_PODS_NS} + local ct=${CHECK_PODS_CT} + + local status=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) + local tcount=$(echo $status | grep -o True | wc -l) + local fcount=$(echo $status | grep -o False | wc -l) + + # Terminate the script in case more pods are ready than expected - nothing to wait for + if [ "${tcount}" -gt "${ct}" ] ; then + echo "The number of ready pods in the '${ns}' namespace is greater than the expected '${ct}' count. Terminating..." + kill -TERM ${SCRIPT_PID} + fi + # Exit with error if any pods are not ready yet + [ "${fcount}" -gt 0 ] && return 1 + # Check the ready pod count + [ "${tcount}" -eq "${ct}" ] && return 0 + return 1 +} + +# Check if MicroShift pods in a given namespace started and verify they are not restarting by sampling +# the pod restart count 10 times every 5 seconds and comparing the current sample with the previous one. 
+# The pods are considered restarting if the number of 'pod-restarting' samples is greater than the +# number of 'pod-not-restarting' ones. +# +# arg1: Name of the namespace to check +# return: 0 if pods are not restarting, or 1 otherwise +function namespace_pods_not_restarting() { + local ns=$1 + local restarts=0 + + local count1=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) + for i in $(seq 10) ; do + sleep 5 + local countS=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].started}' 2>/dev/null | grep -vc false) + local count2=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) + + # If pods started, a restart is detected by comparing the count string between the checks. + # The number of pod restarts is incremented when a restart is detected, or decremented otherwise. + if [ "${countS}" -ne 0 ] && [ "${count1}" = "${count2}" ] ; then + restarts=$(( restarts - 1 )) + else + restarts=$(( restarts + 1 )) + count1=${count2} + fi + done + + [ "${restarts}" -lt 0 ] && return 0 + return 1 +} diff --git a/packaging/greenboot/microshift-pre-rollback.sh b/packaging/greenboot/microshift-pre-rollback.sh old mode 100644 new mode 100755 diff --git a/packaging/greenboot/microshift-running-check.sh b/packaging/greenboot/microshift-running-check.sh index a0eba76943..1aa1978418 100755 --- a/packaging/greenboot/microshift-running-check.sh +++ b/packaging/greenboot/microshift-running-check.sh @@ -3,15 +3,11 @@ set -e SCRIPT_NAME=$(basename $0) SCRIPT_PID=$$ -OCGET_CMD="oc get --kubeconfig /var/lib/microshift/resources/kubeadmin/kubeconfig" -OCGET_OPT="--no-headers" PODS_NS_LIST=(openshift-ovn-kubernetes openshift-service-ca openshift-ingress openshift-dns openshift-storage) PODS_CT_LIST=(2 1 1 2 2) -# Source Greenboot configuration file if it exists -GREENBOOT_CONF_FILE=/etc/greenboot/greenboot.conf -[ -f 
"${GREENBOOT_CONF_FILE}" ] && source ${GREENBOOT_CONF_FILE} -WAIT_TIMEOUT_SECS_BASE=${MICROSHIFT_WAIT_TIMEOUT_SEC:-300} +# Source the MicroShift health check functions library +source /usr/share/microshift/functions/greenboot.sh # Set the exit handler to log the exit status trap 'script_exit' EXIT @@ -26,26 +22,6 @@ function script_exit() { echo $status } -# Run a command with a second delay until it returns a zero exit status -# -# arg1: Time in seconds to wait for a command to succeed -# argN: Command to run with optional arguments -# return: 0 if a command ran successfully within the wait period, or 1 otherwise -function wait_for() { - local timeout=$1 - shift 1 - - local start=$(date +%s) - until ("$@"); do - sleep 1 - - local now=$(date +%s) - [ $(( now - start )) -ge $timeout ] && return 1 - done - - return 0 -} - # Check the microshift.service systemd unit activity, terminating the script # with the SIGTERM signal if the unit reports a failed state # @@ -66,7 +42,7 @@ function microshift_service_active() { } # Check if MicroShift API 'readyz' and 'livez' health endpoints are OK -# +# # args: None # return: 0 if all API health endpoints are OK, or 1 otherwise function microshift_health_endpoints_ok() { @@ -79,7 +55,7 @@ function microshift_health_endpoints_ok() { } # Check if any MicroShift pods are in the 'Running' status -# +# # args: None # return: 0 if any pods are in the 'Running' status, or 1 otherwise function any_pods_running() { @@ -89,82 +65,6 @@ function any_pods_running() { return 1 } -# Check if all the MicroShift pod images in a given namespace are downloaded. 
-# -# args: None -# env1: 'CHECK_PODS_NS' environment variable for the namespace to check -# return: 0 if all the images in a given namespace are downloaded, or 1 otherwise -function namespace_images_downloaded() { - local ns=${CHECK_PODS_NS} - - local images=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o jsonpath="{.items[*].spec.containers[*].image}" 2>/dev/null) - for i in ${images} ; do - # Return an error on the first missing image - local cimage=$(crictl image -q ${i}) - [ -z "${cimage}" ] && return 1 - done - - return 0 -} - -# Check if a given number of MicroShift pods in a given namespace are in the 'Ready' status, -# terminating the script with the SIGTERM signal if more pods are ready than expected. -# -# args: None -# env1: 'CHECK_PODS_NS' environment variable for the namespace to check -# env2: 'CHECK_PODS_CT' environment variable for the pod count to check -# return: 0 if the expected number of pods are ready, or 1 otherwise -function namespace_pods_ready() { - local ns=${CHECK_PODS_NS} - local ct=${CHECK_PODS_CT} - - local status=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) - local tcount=$(echo $status | grep -o True | wc -l) - local fcount=$(echo $status | grep -o False | wc -l) - - # Terminate the script in case more pods are ready than expected - nothing to wait for - if [ "${tcount}" -gt "${ct}" ] ; then - echo "The number of ready pods in the '${ns}' namespace is greater than the expected '${ct}' count. Terminating..." - kill -TERM ${SCRIPT_PID} - fi - # Exit with error if any pods are not ready yet - [ "${fcount}" -gt 0 ] && return 1 - # Check the ready pod count - [ "${tcount}" -eq "${ct}" ] && return 0 - return 1 -} - -# Check if MicroShift pods in a given namespace started and verify they are not restarting by sampling -# the pod restart count 10 times every 5 seconds and comparing the current sample with the previous one. 
-# The pods are considered restarting if the number of 'pod-restarting' samples is greater than the -# number of 'pod-not-restarting' ones. -# -# arg1: Name of the namespace to check -# return: 0 if pods are not restarting, or 1 otherwise -function namespace_pods_not_restarting() { - local ns=$1 - local restarts=0 - - local count1=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) - for i in $(seq 10) ; do - sleep 5 - local countS=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].started}' 2>/dev/null | grep -vc false) - local count2=$(${OCGET_CMD} pods ${OCGET_OPT} -n ${ns} -o 'jsonpath={..status.containerStatuses[].restartCount}' 2>/dev/null) - - # If pods started, a restart is detected by comparing the count string between the checks. - # The number of pod restarts is incremented when a restart is detected, or decremented otherwise. - if [ "${countS}" -ne 0 ] && [ "${count1}" = "${count2}" ] ; then - restarts=$(( restarts - 1 )) - else - restarts=$(( restarts + 1 )) - count1=${count2} - fi - done - - [ "${restarts}" -lt 0 ] && return 0 - return 1 -} - # # Main # @@ -195,14 +95,8 @@ if [ $(systemctl is-enabled microshift.service 2>/dev/null) != "enabled" ] ; the exit 0 fi -# Update the wait timeout according to the boot counter. -# The new wait timeout is a product of the timeout base and the number of boot attempts. 
-MAX_BOOT_ATTEMPTS=${GREENBOOT_MAX_BOOT_ATTEMPTS:-3} -BOOT_COUNTER=$(grub2-editenv - list | grep ^boot_counter= | awk -F= '{print $2}') -[ -z "${BOOT_COUNTER}" ] && BOOT_COUNTER=$(( $MAX_BOOT_ATTEMPTS - 1 )) - -WAIT_TIMEOUT_SECS=$(( $WAIT_TIMEOUT_SECS_BASE * ( $MAX_BOOT_ATTEMPTS - $BOOT_COUNTER ) )) -[ ${WAIT_TIMEOUT_SECS} -le 0 ] && WAIT_TIMEOUT_SECS=${WAIT_TIMEOUT_SECS_BASE} +# Set the wait timeout for the current check based on the boot counter +WAIT_TIMEOUT_SECS=$(get_wait_timeout) # Wait for MicroShift service to be active (failed status terminates the script) echo "Waiting ${WAIT_TIMEOUT_SECS}s for MicroShift service to be active and not failed" @@ -218,7 +112,7 @@ wait_for ${WAIT_TIMEOUT_SECS} any_pods_running # Wait for MicroShift core pod images to be downloaded for i in ${!PODS_NS_LIST[@]}; do - CHECK_PODS_NS=${PODS_NS_LIST[$i]} + CHECK_PODS_NS=${PODS_NS_LIST[$i]} echo "Waiting ${WAIT_TIMEOUT_SECS}s for pod image(s) from the '${CHECK_PODS_NS}' namespace to be downloaded" wait_for ${WAIT_TIMEOUT_SECS} namespace_images_downloaded diff --git a/packaging/rpm/microshift.spec b/packaging/rpm/microshift.spec index ffd2f32a52..1664bd10ad 100644 --- a/packaging/rpm/microshift.spec +++ b/packaging/rpm/microshift.spec @@ -208,6 +208,8 @@ install -d -m755 %{buildroot}%{_sysconfdir}/greenboot/check/required.d install -d -m755 %{buildroot}%{_sysconfdir}/greenboot/red.d install -p -m755 packaging/greenboot/microshift-running-check.sh %{buildroot}%{_sysconfdir}/greenboot/check/required.d/40_microshift_running_check.sh install -p -m755 packaging/greenboot/microshift-pre-rollback.sh %{buildroot}%{_sysconfdir}/greenboot/red.d/40_microshift_pre_rollback.sh +install -d -m755 %{buildroot}%{_datadir}/microshift/functions +install -p -m644 packaging/greenboot/functions.sh %{buildroot}%{_datadir}/microshift/functions/greenboot.sh %post @@ -289,6 +291,7 @@ systemctl enable --now --quiet openvswitch || true %files greenboot 
%{_sysconfdir}/greenboot/check/required.d/40_microshift_running_check.sh %{_sysconfdir}/greenboot/red.d/40_microshift_pre_rollback.sh +%{_datadir}/microshift/functions/greenboot.sh # Use Git command to generate the log and replace the VERSION string # LANG=C git log --date="format:%a %b %d %Y" --pretty="tformat:* %cd %an <%ae> VERSION%n- %s%n" packaging/rpm/microshift.spec