diff --git a/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/bootstrap-in-place.sh b/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/bootstrap-in-place.sh index 2661c40edb4..00e875c0fd3 100755 --- a/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/bootstrap-in-place.sh +++ b/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/bootstrap-in-place.sh @@ -12,6 +12,7 @@ bootkube_podman_run() { } if [ ! -f stop-etcd.done ]; then + record_service_stage_start "stop-etcd" echo "Stop etcd static pod by moving the manifest" mv /etc/kubernetes/manifests/etcd-member-pod.yaml /etc/kubernetes || echo "already moved etcd-member-pod.yaml" @@ -21,9 +22,11 @@ if [ ! -f stop-etcd.done ]; then done touch stop-etcd.done + record_service_stage_success fi if [ ! -f master-ignition.done ]; then + record_service_stage_start "master-ignition" echo "Creating master ignition and writing it to disk" # Get the master ignition from MCS curl --header 'Accept:application/vnd.coreos.ignition+json;version=3.2.0' \ @@ -50,4 +53,5 @@ if [ ! -f master-ignition.done ]; then --output /assets/master.ign touch master-ignition.done + record_service_stage_success fi diff --git a/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/install-to-disk.sh.template b/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/install-to-disk.sh.template index 7d6f8c88ea9..6996e593add 100644 --- a/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/install-to-disk.sh.template +++ b/data/data/bootstrap/bootstrap-in-place/files/usr/local/bin/install-to-disk.sh.template @@ -3,20 +3,28 @@ set -euoE pipefail ## -E option will cause functions to inherit trap # This script is executed by install-to-disk service when installing single node with bootstrap in place +. 
/usr/local/bin/bootstrap-service-record.sh + +record_service_stage_start "wait-for-bootkube" echo "Waiting for /opt/openshift/.bootkube.done" until [ -f /opt/openshift/.bootkube.done ]; do sleep 5 done +record_service_stage_success if [ ! -f coreos-installer.done ]; then + record_service_stage_start "coreos-installer" # Write image + ignition to disk echo "Executing coreos-installer with the following options: install -i /opt/openshift/master.ign {{.BootstrapInPlace.InstallationDisk}}" coreos-installer install -i /opt/openshift/master.ign {{.BootstrapInPlace.InstallationDisk}} touch coreos-installer.done + record_service_stage_success fi +record_service_stage_start "reboot" echo "Going to reboot" shutdown -r +1 "Bootstrap completed, server is going to reboot." touch /opt/openshift/.install-to-disk.done echo "Done" +record_service_stage_success diff --git a/data/data/bootstrap/files/usr/local/bin/approve-csr.sh b/data/data/bootstrap/files/usr/local/bin/approve-csr.sh index f0018e7e476..2372d39f4b3 100644 --- a/data/data/bootstrap/files/usr/local/bin/approve-csr.sh +++ b/data/data/bootstrap/files/usr/local/bin/approve-csr.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# shellcheck disable=SC1091 # using path on bootstrap machine +. /usr/local/bin/bootstrap-service-record.sh + KUBECONFIG="${1}" echo "Approving all CSR requests until bootstrapping is complete..." diff --git a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template index cb6df518e1b..84c707fe304 100755 --- a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template @@ -1,6 +1,8 @@ #!/usr/bin/env bash set -euoE pipefail ## -E option will cause functions to inherit trap +. /usr/local/bin/bootstrap-service-record.sh + . 
/usr/local/bin/release-image.sh mkdir --parents /etc/kubernetes/{manifests,bootstrap-configs,bootstrap-manifests} @@ -68,13 +70,16 @@ mkdir --parents ./{bootstrap-manifests,manifests} if [ ! -f openshift-manifests.done ] then + record_service_stage_start "openshift-manifests" echo "Moving OpenShift manifests in with the rest of them" cp openshift/* manifests/ touch openshift-manifests.done + record_service_stage_success fi if [ ! -f cvo-bootstrap.done ] then + record_service_stage_start "cvo-bootstrap" echo "Rendering Cluster Version Operator Manifests..." rm --recursive --force cvo-bootstrap @@ -95,11 +100,13 @@ then cp auth/kubeconfig-loopback /etc/kubernetes/kubeconfig touch cvo-bootstrap.done + record_service_stage_success fi ETCD_ENDPOINTS=https://localhost:2379 if [ ! -f etcd-bootstrap.done ] then + record_service_stage_start "etcd-bootstrap" echo "Rendering CEO Manifests..." bootkube_podman_run \ --volume "$PWD:/assets:z" \ @@ -131,10 +138,12 @@ then fi touch etcd-bootstrap.done + record_service_stage_success fi if [ ! -f config-bootstrap.done ] then + record_service_stage_start "config-bootstrap" echo "Rendering cluster config manifests..." rm --recursive --force config-bootstrap @@ -158,10 +167,12 @@ then cp config-bootstrap/manifests/* manifests/ touch config-bootstrap.done + record_service_stage_success fi if [ ! -f kube-apiserver-bootstrap.done ] then + record_service_stage_start "kube-apiserver-bootstrap" echo "Rendering Kubernetes API server core manifests..." rm --recursive --force kube-apiserver-bootstrap @@ -185,10 +196,12 @@ then cp kube-apiserver-bootstrap/manifests/* manifests/ touch kube-apiserver-bootstrap.done + record_service_stage_success fi if [ ! -f kube-controller-manager-bootstrap.done ] then + record_service_stage_start "kube-controller-manager-bootstrap" echo "Rendering Kubernetes Controller Manager core manifests..." 
rm --recursive --force kube-controller-manager-bootstrap @@ -211,10 +224,12 @@ then cp kube-controller-manager-bootstrap/manifests/* manifests/ touch kube-controller-manager-bootstrap.done + record_service_stage_success fi if [ ! -f kube-scheduler-bootstrap.done ] then + record_service_stage_start "kube-scheduler-bootstrap" echo "Rendering Kubernetes Scheduler core manifests..." rm --recursive --force kube-scheduler-bootstrap @@ -233,10 +248,12 @@ then cp kube-scheduler-bootstrap/manifests/* manifests/ touch kube-scheduler-bootstrap.done + record_service_stage_success fi if [ ! -f ingress-operator-bootstrap.done ] then + record_service_stage_start "ingress-operator-bootstrap" echo "Rendering Ingress Operator core manifests..." rm --recursive --force ingress-operator-bootstrap @@ -251,10 +268,12 @@ then cp ingress-operator-manifests/* manifests/ touch ingress-operator-bootstrap.done + record_service_stage_success fi if [ ! -f mco-bootstrap.done ] then + record_service_stage_start "mco-bootstrap" echo "Rendering MCO manifests..." rm --recursive --force mco-bootstrap @@ -323,10 +342,12 @@ then cp tls/machine-config-server.key /etc/ssl/mcs/tls.key touch mco-bootstrap.done + record_service_stage_success fi if [ ! -f cco-bootstrap.done ] then + record_service_stage_start "cco-bootstrap" echo "Rendering CCO manifests..." rm --recursive --force cco-bootstrap @@ -349,13 +370,16 @@ then fi touch cco-bootstrap.done + record_service_stage_success fi # in case of single node, if we removed etcd, there is no point to wait for it on restart if [ ! -f stop-etcd.done ] then + record_service_stage_start "wait-for-etcd" # Wait for the etcd cluster to come up. wait_for_etcd_cluster + record_service_stage_success fi REQUIRED_PODS="openshift-kube-apiserver/kube-apiserver,openshift-kube-scheduler/openshift-kube-scheduler,openshift-kube-controller-manager/kube-controller-manager,openshift-cluster-version/cluster-version-operator" @@ -368,6 +392,7 @@ echo "Starting cluster-bootstrap..." 
if [ ! -f cb-bootstrap.done ] then + record_service_stage_start "cb-bootstrap" bootkube_podman_run \ --rm \ --volume "$PWD:/assets:z" \ @@ -376,6 +401,7 @@ then start --tear-down-early=false --asset-dir=/assets --required-pods="${REQUIRED_PODS}" touch cb-bootstrap.done + record_service_stage_success fi if [ "$BOOTSTRAP_INPLACE" = true ] @@ -386,6 +412,7 @@ else if [ ! -z "$CLUSTER_ETCD_OPERATOR_IMAGE" ] then + record_service_stage_start "wait-for-ceo" echo "Waiting for CEO to finish..." bootkube_podman_run \ --volume "$PWD:/assets:z" \ @@ -393,8 +420,10 @@ else /usr/bin/cluster-etcd-operator \ wait-for-ceo \ --kubeconfig /assets/auth/kubeconfig + record_service_stage_success fi fi + # Workaround for https://github.com/opencontainers/runc/pull/1807 touch /opt/openshift/.bootkube.done echo "bootkube.service complete" diff --git a/data/data/bootstrap/files/usr/local/bin/bootstrap-service-record.sh b/data/data/bootstrap/files/usr/local/bin/bootstrap-service-record.sh new file mode 100644 index 00000000000..66cb04abed5 --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/bootstrap-service-record.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# This library provides a helper functions for recording when a service +# and its stages start and end. + +### +# When running as a pre- or post-command, set PRE_COMMAND or POST_COMMAND, respectively. +# These must be set *prior* to sourcing this script. +# PRE_COMMAND is the name identifying the pre-command being run. +# POST_COMMAND is the name identifying the post-command being run. + +# SERVICE_RECORDS_DIR is the directory under which service records will be stored. +SERVICE_RECORDS_DIR="${SERVICE_RECORDS_DIR:-/var/log/openshift/}" +# SYSTEMD_UNIT_NAME is the name of the systemd unit for the service +SYSTEMD_UNIT_NAME="$(ps -o unit= $$)" +# SERVICE_NAME is the name of the service +SERVICE_NAME="${SERVICE_NAME:-${SYSTEMD_UNIT_NAME%.service}}" + +# add_service_record_entry adds a record entry to the service records file. 
+# The entry is built from the following variables, which are read from the caller's scope
+# (they are not positional arguments).
+# PHASE - phase being recorded; one of "service start", "service end", "stage start", "stage end", "pre-command start",
+#         "pre-command end", "post-command start", "post-command end"
+# RESULT (optional) - result of the action
+# STAGE_NAME (optional) - stage of the service
+# PRE_COMMAND (optional) - name of the pre-command
+# POST_COMMAND (optional) - name of the post-command
+# ERROR_LINE (optional) - line where the error occurred
+# ERROR_MESSAGE (optional) - message for the error
+add_service_record_entry() {
+    local FILENAME="${SERVICE_RECORDS_DIR}/${SERVICE_NAME}.json"
+    mkdir --parents "$(dirname "${FILENAME}")"
+    # Append the new entry to the existing array in the file.
+    # If the file does not already exist, start with an empty array.
+    # The new entry contains only the fields that have non-empty values, to omit optional values that were not provided.
+    # An explicit if/else is used instead of `test && cat || echo` so that a failing cat does not
+    # additionally feed '[]' into jq.
+    # The timestamp is taken in UTC (date -u) because the format string ends in a literal "Z".
+    # The output goes to a temporary file that replaces the records file only when the pipeline succeeds.
+    if [ -f "${FILENAME}" ]
+    then
+        cat "${FILENAME}"
+    else
+        echo '[]'
+    fi | \
+        jq \
+            --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
+            --arg preCommand "${PRE_COMMAND-}" \
+            --arg postCommand "${POST_COMMAND-}" \
+            --arg stage "${STAGE_NAME-}" \
+            --arg phase "${PHASE}" \
+            --arg result "${RESULT-}" \
+            --arg errorLine "${ERROR_LINE-}" \
+            --arg errorMessage "${ERROR_MESSAGE-}" \
+            '. += [
+                {$timestamp,$preCommand,$postCommand,$stage,$phase,$result,$errorLine,$errorMessage} |
+                reduce keys[] as $k (.; if .[$k] == "" then del(.[$k]) else . end)
+            ]' \
+        > "${FILENAME}.tmp" && \
+        mv "${FILENAME}.tmp" "${FILENAME}"
+}
+
+# record_service_start() records the start of a service.
+# The recorded phase is "pre-command start" when PRE_COMMAND is non-empty, "post-command start" when
+# POST_COMMAND is non-empty, and "service start" otherwise.
+record_service_start() {
+    if [ "${PRE_COMMAND-}" ]
+    then
+        local PHASE="pre-command start"
+    elif [ "${POST_COMMAND-}" ]
+    then
+        local PHASE="post-command start"
+    else
+        local PHASE="service start"
+    fi
+
+    add_service_record_entry
+}
+
+# record_service_end(result) records the end of a service.
+# ERROR_LINE - line where the error occurred, if there was an error
+# ERROR_MESSAGE - error message, if there was an error
+# The recorded phase is "pre-command end" when PRE_COMMAND is non-empty, "post-command end" when
+# POST_COMMAND is non-empty, and "service end" otherwise.
+# $1 - result to record; required
+record_service_end() {
+    if [ "${PRE_COMMAND-}" ]
+    then
+        local PHASE="pre-command end"
+    elif [ "${POST_COMMAND-}" ]
+    then
+        local PHASE="post-command end"
+    else
+        local PHASE="service end"
+    fi
+    local RESULT=${1:?Must specify a result}
+
+    add_service_record_entry
+}
+
+# record_service_stage_start(stage_name) records the start of a stage of a service.
+# $1 - name of the stage; required
+# Sets the global STAGE_NAME, which marks a stage as being in progress.
+# Exits with status 1 if a stage is already in progress.
+record_service_stage_start() {
+    if [ "${STAGE_NAME-}" ]
+    then
+        # Diagnostics go to stderr.
+        echo "attempt to record the start of a stage without ending the previous one" >&2
+        exit 1
+    fi
+
+    local PHASE="stage start"
+    # Deliberately not local: the stage name must outlive this call so that the matching
+    # "stage end" entry can carry it.
+    STAGE_NAME=${1:?Must specify a stage name}
+
+    add_service_record_entry
+}
+
+# record_service_stage_end(result) records the end of a stage of a service.
+# ERROR_LINE - line where the error occurred, if there was an error
+# ERROR_MESSAGE - error message, if there was an error
+# $1 - result to record; required
+# Clears the global STAGE_NAME once the entry has been added.
+# Exits with status 1 if no stage is in progress.
+record_service_stage_end() {
+    if [ -z "${STAGE_NAME-}" ]
+    then
+        # Diagnostics go to stderr.
+        echo "attempt to record the end of a stage without starting one" >&2
+        exit 1
+    fi
+
+    local PHASE="stage end"
+    local RESULT=${1:?Must specify a result}
+
+    add_service_record_entry
+
+    STAGE_NAME=""
+}
+
+# record_service_stage_success records the successful end of a stage of a service.
+record_service_stage_success() {
+    record_service_stage_end "success"
+}
+
+# record_service_stage_failure records the failing end of a stage of a service.
+# The error line and error message are collected with get_error_info and included in the entry.
+record_service_stage_failure() {
+    local ERROR_LINE
+    local ERROR_MESSAGE
+    get_error_info ERROR_LINE ERROR_MESSAGE
+    record_service_stage_end "failure"
+}
+
+# record_service_exit(exit_code) records the end of the service. It is installed as the EXIT trap below.
+# $1 - exit status of the script; 0 is recorded as "success", anything else as "failure"
+# If a stage is still in progress, the end of that stage is recorded first, with the same result.
+record_service_exit() {
+    if [ "$1" -eq 0 ]
+    then
+        local RESULT="success"
+    else
+        local RESULT="failure"
+        local ERROR_LINE
+        local ERROR_MESSAGE
+        get_error_info ERROR_LINE ERROR_MESSAGE
+    fi
+
+    if [ "${STAGE_NAME-}" ]
+    then
+        record_service_stage_end "${RESULT}"
+    fi
+
+    record_service_end "${RESULT}"
+}
+
+# get_error_info(line_var, message_var) stores error details in the caller's variables.
+# $1 - name of the variable that receives the output of `caller 1`
+# $2 - name of the variable that receives the last three lines of the unit's journal
+# Collecting these details is best-effort. The scripts that source this library run with `set -e`,
+# so a failing `caller` or `journalctl` must not abort the script (or the EXIT trap) before the
+# failure entry has been written; hence the `|| true` guards.
+get_error_info() {
+    local -n error_line=$1
+    local -n error_message=$2
+    # shellcheck disable=SC2034 # variable used indirectly
+    error_line="$(caller 1)" || true
+    # shellcheck disable=SC2034 # variable used indirectly
+    error_message="$(journalctl --unit="${SYSTEMD_UNIT_NAME}" --lines=3 --output=cat)" || true
+}
+
+record_service_start
+
+trap 'record_service_exit $?' EXIT
\ No newline at end of file
diff --git a/data/data/bootstrap/files/usr/local/bin/crio-configure.sh.template b/data/data/bootstrap/files/usr/local/bin/crio-configure.sh.template
index a6c0175f167..28bc1d2d8bd 100755
--- a/data/data/bootstrap/files/usr/local/bin/crio-configure.sh.template
+++ b/data/data/bootstrap/files/usr/local/bin/crio-configure.sh.template
@@ -8,6 +8,8 @@ set -euo pipefail
 # Perhaps down the line we change this to run something like:
 # podman run machine-config-daemon bootstrap ... (passing the release image and the host rootfs)
+. /usr/local/bin/bootstrap-service-record.sh
+
 .
/usr/local/bin/release-image.sh MACHINE_CONFIG_INFRA_IMAGE=$(image_for pod) diff --git a/data/data/bootstrap/files/usr/local/bin/installer-gather.sh b/data/data/bootstrap/files/usr/local/bin/installer-gather.sh index e6f3802db58..c9f99a0e8cc 100755 --- a/data/data/bootstrap/files/usr/local/bin/installer-gather.sh +++ b/data/data/bootstrap/files/usr/local/bin/installer-gather.sh @@ -11,6 +11,10 @@ mkdir -p "${ARTIFACTS}" exec &> >(tee "${ARTIFACTS}/gather.log") +echo "Gathering bootstrap service records ..." +mkdir -p "${ARTIFACTS}/bootstrap/services" +sudo cp -r /var/log/openshift/* "${ARTIFACTS}/bootstrap/services/" + echo "Gathering bootstrap systemd summary ..." LANG=POSIX systemctl list-units --state=failed >& "${ARTIFACTS}/failed-units.txt" diff --git a/data/data/bootstrap/files/usr/local/bin/kubelet-pause-image.sh.template b/data/data/bootstrap/files/usr/local/bin/kubelet-pause-image.sh.template index cffd8ee1aa9..93a318488a8 100755 --- a/data/data/bootstrap/files/usr/local/bin/kubelet-pause-image.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/kubelet-pause-image.sh.template @@ -5,6 +5,9 @@ set -euo pipefail # Need to set the --pod-infra-container-image flag for the kubelet to point to the pause image from the payload # So we add MACHINE_CONFIG_INFRA_IMAGE to an environment file and source that in the kubelet service +PRE_COMMAND="kubelet-pause-image" +. /usr/local/bin/bootstrap-service-record.sh + . /usr/local/bin/release-image.sh echo "MACHINE_CONFIG_INFRA_IMAGE=$(image_for pod)" > /etc/kubernetes/kubelet-pause-image-override diff --git a/data/data/bootstrap/files/usr/local/bin/kubelet.sh b/data/data/bootstrap/files/usr/local/bin/kubelet.sh new file mode 100644 index 00000000000..9af2b827202 --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/kubelet.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +# shellcheck disable=SC1091 # using path on bootstrap machine +. 
/usr/local/bin/bootstrap-service-record.sh + +/usr/bin/hyperkube \ + kubelet \ + --anonymous-auth=false \ + --container-runtime=remote \ + --container-runtime-endpoint=/var/run/crio/crio.sock \ + --runtime-request-timeout="${KUBELET_RUNTIME_REQUEST_TIMEOUT}" \ + --pod-manifest-path=/etc/kubernetes/manifests \ + --minimum-container-ttl-duration=6m0s \ + --cluster-domain=cluster.local \ + --cgroup-driver=systemd \ + --serialize-image-pulls=false \ + --v=2 \ + --volume-plugin-dir=/etc/kubernetes/kubelet-plugins/volume/exec \ + --pod-infra-container-image="${MACHINE_CONFIG_INFRA_IMAGE}" diff --git a/data/data/bootstrap/files/usr/local/bin/release-image-download.sh.template b/data/data/bootstrap/files/usr/local/bin/release-image-download.sh.template index 71a73484d29..d96c793faf2 100755 --- a/data/data/bootstrap/files/usr/local/bin/release-image-download.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/release-image-download.sh.template @@ -9,11 +9,20 @@ set -euo pipefail # service: https://github.com/systemd/systemd/issues/2582. # +. /usr/local/bin/bootstrap-service-record.sh + RELEASE_IMAGE={{.ReleaseImage}} echo "Pulling $RELEASE_IMAGE..." -while ! podman pull --quiet "$RELEASE_IMAGE" +while true do - echo "Pull failed. Retrying $RELEASE_IMAGE..." + record_service_stage_start "pull-release-image" + if podman pull --quiet "$RELEASE_IMAGE" + then + record_service_stage_success + break + else + record_service_stage_failure + echo "Pull failed. Retrying $RELEASE_IMAGE..." + fi done - diff --git a/data/data/bootstrap/files/usr/local/bin/report-progress.sh b/data/data/bootstrap/files/usr/local/bin/report-progress.sh index 16f662c63a0..7f625440aca 100755 --- a/data/data/bootstrap/files/usr/local/bin/report-progress.sh +++ b/data/data/bootstrap/files/usr/local/bin/report-progress.sh @@ -1,17 +1,23 @@ #!/usr/bin/env bash +# shellcheck disable=SC1091 # using path on bootstrap machine +. 
/usr/local/bin/bootstrap-service-record.sh + KUBECONFIG="${1}" -wait_for_existance() { +wait_for_existence() { while [ ! -e "${1}" ] do sleep 5 done } +record_service_stage_start "wait-for-bootstrap-complete" echo "Waiting for bootstrap to complete..." -wait_for_existance /opt/openshift/.bootkube.done +wait_for_existence /opt/openshift/.bootkube.done +record_service_stage_success +record_service_stage_start "report-bootstrap-complete" echo "Reporting install progress..." while ! oc --kubeconfig="$KUBECONFIG" create -f - <<-EOF apiVersion: v1 @@ -25,3 +31,4 @@ EOF do sleep 5 done +record_service_stage_success diff --git a/data/data/bootstrap/systemd/units/chown-gatewayd-key.service b/data/data/bootstrap/systemd/units/chown-gatewayd-key.service index 993414fbc72..d226bc4a2d0 100644 --- a/data/data/bootstrap/systemd/units/chown-gatewayd-key.service +++ b/data/data/bootstrap/systemd/units/chown-gatewayd-key.service @@ -9,7 +9,7 @@ Before=systemd-journal-gatewayd.service [Service] Type=oneshot RemainAfterExit=yes -ExecStart=/bin/sh -c "if ! getent passwd systemd-journal-gateway &>/dev/null; then useradd -r systemd-journal-gateway; fi && chown systemd-journal-gateway: /opt/openshift/tls/journal-gatewayd.{crt,key}" +ExecStart=/bin/sh -c ". /usr/local/bin/bootstrap-service-record.sh; if ! 
getent passwd systemd-journal-gateway &>/dev/null; then useradd -r systemd-journal-gateway; fi && chown systemd-journal-gateway: /opt/openshift/tls/journal-gatewayd.{crt,key}" [Install] WantedBy=multi-user.target diff --git a/data/data/bootstrap/systemd/units/kubelet.service.template b/data/data/bootstrap/systemd/units/kubelet.service.template index 1888ddeea8e..092d4c8e6e1 100644 --- a/data/data/bootstrap/systemd/units/kubelet.service.template +++ b/data/data/bootstrap/systemd/units/kubelet.service.template @@ -5,6 +5,7 @@ After=crio.service release-image.service [Service] Type=notify +NotifyAccess=all ExecStartPre=/bin/mkdir --parents /etc/kubernetes/manifests ExecStartPre=/bin/mkdir --parents /etc/kubernetes/kubelet-plugins/volume/exec ExecStartPre=/usr/local/bin/kubelet-pause-image.sh @@ -12,20 +13,7 @@ Environment=KUBELET_RUNTIME_REQUEST_TIMEOUT=10m EnvironmentFile=-/etc/kubernetes/kubelet-env EnvironmentFile=-/etc/kubernetes/kubelet-pause-image-override -ExecStart=/usr/bin/hyperkube \ - kubelet \ - --anonymous-auth=false \ - --container-runtime=remote \ - --container-runtime-endpoint=/var/run/crio/crio.sock \ - --runtime-request-timeout=${KUBELET_RUNTIME_REQUEST_TIMEOUT} \ - --pod-manifest-path=/etc/kubernetes/manifests \ - --minimum-container-ttl-duration=6m0s \ - --cluster-domain=cluster.local \ - --cgroup-driver=systemd \ - --serialize-image-pulls=false \ - --v=2 \ - --volume-plugin-dir=/etc/kubernetes/kubelet-plugins/volume/exec \ - --pod-infra-container-image=${MACHINE_CONFIG_INFRA_IMAGE} +ExecStart=/usr/local/bin/kubelet.sh Restart=always RestartSec=10 diff --git a/docs/dev/bootstrap_services.md b/docs/dev/bootstrap_services.md new file mode 100644 index 00000000000..97759a9e808 --- /dev/null +++ b/docs/dev/bootstrap_services.md @@ -0,0 +1,95 @@ +#### Bootstrap Service Records #### + +For the purposes of diagnosing installation failures that occur during bootstrapping, the +progresses of services running on the bootstrap machines are tracked in json 
files in the +/var/log/openshift directory. The progress for each service is tracked in its own file. For +example, the bootkube service progress is tracked in the /var/log/openshift/bootkube.json +file. The following progress events are tracked. +* A service adds an entry when the service starts. +* A service adds an entry when the service ends. The entry includes the result of the service +invocation, either success or failure. If the invocation failed, then the entry includes +the line number of the error and the last three lines from the service's journal log. +* A service adds an entry when a stage of the service starts. An example of a service stage +is the cvo-bootstrap stage of the bootkube service. During that stage, the +cluster-version-operator renders its manifests. +* A service adds an entry when a stage of the service ends. This is similar to the entry +added when a service ends, including having a result and error information if applicable. +* A service adds an entry when a pre- or post-command starts. +* A service adds an entry when a pre- or post-command ends. This is similar to the entry +added when a service ends, including having a result and error information if applicable. + +##### Managing Service Records in a Service ##### + +To track its progress, a service should source the /usr/local/bin/bootstrap-service-record.sh +script. When a service sources the script, the script will add an entry to the json file for +the service indicating that the service started. When the service ends, either successfully +or due to an error, the script will add an entry to the json file for the service indicating that +the service ended. The script will consider whether the last command executed was successful or +not in order to determine whether the service was successful. + +For tracking stages, the service should call functions from the sourced script. +* The service should call the `record_service_stage_start` function when a stage of the service +starts. 
The function takes the name of the stage as its single argument. +* The service should call the `record_service_stage_success` function when a stage of the service +ends successfully. +* The service should call the `record_service_stage_failure` function when a stage of the service +ends due to a failure. The script will automatically record an entry for a stage failure if the +service ends during the execution of a stage. + +###### Pre- and Post-Commands ###### + +If a service has pre- or post-commands that could either run for significant periods or could +potentially fail, then those commands should add to the json file as well. Such a command should +source the same /usr/local/bin/bootstrap-service-record.sh script. It should also set either the +`PRE_COMMAND` or `POST_COMMAND` environment variable with a value that identifies the command. +For example, kubelet.service has a pre-command of /usr/local/bin/kubelet-pause-image.sh. The +kubelet-pause-image.sh script sets the `PRE_COMMAND` environment variable to "kubelet-pause-image" +before sourcing the bootstrap-service-record.sh script. The entries for the pre-command will +contain a `preCommand` field with the "kubelet-pause-image" value. + +###### Sample Script ###### + +```shell script +#!/usr/bin/env bash +set -euoE pipefail + +# Source the script to record service entries. +# This will create an entry for the start of the service. +. /usr/local/bin/bootstrap-service-record.sh + +# Record the start of the "first" stage. +record_service_stage_start "first" + +# Record the successful end of the "first" stage. +record_service_stage_success + +while true +do + # Record the start of the "second" stage. + record_service_stage_start "second-stage" + + if [ some_check ] + then + # Record the successful end of the "second" stage. + record_service_stage_success + break + else + # Record the failing end of the "second" stage. + record_service_stage_failure + fi +done + +# Record the start of the third stage. 
+record_service_stage_start "third" + +# If the command fails, then an entry will be recorded for the failing end +# of the third stage and an entry will be recorded for the failing end of +# the service. +some_command_that_may_fail + +# Record the end of the third stage. +record_service_stage_success + +# Since this is the end of the script, an entry will be recorded for the +# successful end of the service. +``` \ No newline at end of file