diff --git a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template index 7e928b90229..51dcb8ae8bd 100755 --- a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template @@ -5,6 +5,7 @@ set -euoE pipefail ## -E option will cause functions to inherit trap . /usr/local/bin/release-image.sh . /usr/local/bin/bootstrap-cluster-gather.sh +. /usr/local/bin/bootstrap-verify-api-server-urls.sh mkdir --parents /etc/kubernetes/{manifests,bootstrap-configs,bootstrap-manifests} @@ -356,6 +357,19 @@ then record_service_stage_success fi +# Check if the API and API_INT Server URLs can be resolved and reached. +echo "Check if API and API-Int URLs are resolvable during bootstrap" +API_SERVER_URL="{{.APIServerURL}}" +API_INT_SERVER_URL="{{.APIIntServerURL}}" + +if [[ ! -z "${API_SERVER_URL}" ]] ; then + check_url "API_URL" "${API_SERVER_URL}" +fi + +if [[ ! -z "${API_INT_SERVER_URL}" ]] ; then + check_url "API_INT_URL" "${API_INT_SERVER_URL}" +fi + if [ ! -f cco-bootstrap.done ] then record_service_stage_start "cco-bootstrap" diff --git a/data/data/bootstrap/files/usr/local/bin/bootstrap-verify-api-server-urls.sh b/data/data/bootstrap/files/usr/local/bin/bootstrap-verify-api-server-urls.sh new file mode 100644 index 00000000000..8805221a12b --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/bootstrap-verify-api-server-urls.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +# shellcheck disable=SC1091 +. /usr/local/bin/bootstrap-service-record.sh + +# This function expects 2 arguments: +# 1. name of the URL +# 2. The value of the URL +function resolve_url() { + unset IPS + unset IP + IPS=$(dig "${2}" +short) + if [[ ! -z "${IPS}" ]] ; then + echo "Successfully resolved ${1} ${2}" + # dig returns multiple IPs. Check if the + # first IP is reachable. 
+ ip_arr="" + readarray ip_arr -t <<<"${IPS}" + IP="$(echo "${ip_arr[0]}" | tr -d '\n')" + return 0 + else + echo "Unable to resolve ${1} ${2}" + return 1 + fi +} + +# This function expects 2 arguments: +# 1. name of the URL +# 2. URL to validate +function validate_url() { + if [[ $(curl --head -k --silent --fail --write-out "%{http_code}\\n" "${2}" -o /dev/null) == 200 ]]; then + echo "Success while trying to reach ${1}'s https endpoint at ${2}" + return 0 + else + echo "Unable to reach ${1}'s https endpoint at ${2}" + return 1 + fi +} + +function check_url() { + if [[ -z "${1}" ]] || [[ -z "${2}" ]]; then + echo "Usage: check_url " + return + fi + + local URL_TYPE=${1} + local SERVER_URL=${2} + + if [[ ${URL_TYPE} != API_URL ]] && [[ ${URL_TYPE} != API_INT_URL ]]; then + echo "Usage: check_url " + return + fi + + echo "Checking validity of ${SERVER_URL} of type ${URL_TYPE}" + + if [[ "${URL_TYPE}" = "API_URL" ]]; then + local URL_STAGE_NAME="check-api-url" + else + local URL_STAGE_NAME="check-api-int-url" + fi + + echo "Starting stage ${URL_STAGE_NAME}" + record_service_stage_start ${URL_STAGE_NAME} + if resolve_url "$URL_TYPE" "$SERVER_URL"; then + record_service_stage_success + else + record_service_stage_failure + # We do not want to stop bootkube service due to this failure. + # So not returning failure at this point. + return + fi + + CURL_URL="https://${IP}:6443/version" + + record_service_stage_start ${URL_STAGE_NAME} + if validate_url "$URL_TYPE" "$CURL_URL"; then + record_service_stage_success + else + echo "It might be too early for the ${CURL_URL} to be available." 
+ record_service_stage_failure + fi +} diff --git a/pkg/asset/ignition/bootstrap/common.go b/pkg/asset/ignition/bootstrap/common.go index c0b76f8808e..1d261aeb6dd 100644 --- a/pkg/asset/ignition/bootstrap/common.go +++ b/pkg/asset/ignition/bootstrap/common.go @@ -80,6 +80,8 @@ type bootstrapTemplateData struct { UseIPv6ForNodeIP bool IsOKD bool BootstrapNodeIP string + APIServerURL string + APIIntServerURL string } // platformTemplateData is the data to use to replace values in bootstrap @@ -285,6 +287,9 @@ func (a *Common) getTemplateData(dependencies asset.Parents, bootstrapInPlace bo if bootstrapInPlace { bootstrapInPlaceConfig = installConfig.Config.BootstrapInPlace } + + apiURL := fmt.Sprintf("api.%s", installConfig.Config.ClusterDomain()) + apiIntURL := fmt.Sprintf("api-int.%s", installConfig.Config.ClusterDomain()) return &bootstrapTemplateData{ AdditionalTrustBundle: installConfig.Config.AdditionalTrustBundle, FIPS: installConfig.Config.FIPS, @@ -301,6 +306,8 @@ func (a *Common) getTemplateData(dependencies asset.Parents, bootstrapInPlace bo UseIPv6ForNodeIP: APIIntVIPonIPv6, IsOKD: installConfig.Config.IsOKD(), BootstrapNodeIP: bootstrapNodeIP, + APIServerURL: apiURL, + APIIntServerURL: apiIntURL, } } diff --git a/pkg/gather/service/analyze.go b/pkg/gather/service/analyze.go index 83e98dc2784..ea851c6f7ff 100644 --- a/pkg/gather/service/analyze.go +++ b/pkg/gather/service/analyze.go @@ -72,14 +72,20 @@ func analyzeGatherBundle(bundleFile io.Reader) error { } analysisChecks := []struct { - name string - check func(analysis) bool + name string + check func(analysis) bool + optional bool }{ - {name: "release-image", check: checkReleaseImageDownload}, + {name: "release-image", check: checkReleaseImageDownload, optional: false}, + {name: "bootkube", check: checkAPIURLs, optional: false}, } for _, check := range analysisChecks { a := serviceAnalyses[check.name] if a.starts == 0 { + if check.optional { + logrus.Infof("The bootstrap machine did not execute the 
%s.service systemd unit", check.name) + break + } logrus.Errorf("The bootstrap machine did not execute the %s.service systemd unit", check.name) break } @@ -100,6 +106,23 @@ func checkReleaseImageDownload(a analysis) bool { return false } +// bootstrap-verify-api-server-urls.sh is currently running as part of the bootkube service. +// And the verification of the API and API-Int URLs is the only stage where a failure is +// currently reported. So, here we are able to conclude that a failure corresponds to a +// failure to resolve either the API URL or API-Int URL or both. If that changes and if +// any other stage in the bootkube service starts reporting a failure, we need to revisit +// this. At that point verification of the URLs could be moved to its own service. +func checkAPIURLs(a analysis) bool { + if a.successful { + return true + } + // Note: Even when there is a stage failure, we are not returning false here. That is + // intentional because we do not want to report this as an error in the "analyze" output. 
+ logrus.Warn("The bootstrap machine is unable to resolve API and/or API-Int Server URLs") + a.logLastError() + return true +} + type analysis struct { // starts is the number of times that the service started starts int diff --git a/pkg/gather/service/analyze_test.go b/pkg/gather/service/analyze_test.go index e44d3759afd..cdee040d84a 100644 --- a/pkg/gather/service/analyze_test.go +++ b/pkg/gather/service/analyze_test.go @@ -11,6 +11,41 @@ import ( "github.com/stretchr/testify/assert" ) +func generateSuccessOutput(stage string) string { + return `[ +{"phase":"service start"}, +{"phase":"stage start", "stage":"` + stage + `"}, +{"phase":"stage end", "stage":"` + stage + `", "result":"success"}, +{"phase":"service end", "result":"success"} +]` +} + +func generateFailureOutput(stage string) string { + return `[ +{"phase":"service start"}, +{"phase":"stage start", "stage":"` + stage + `"}, +{"phase":"stage end", "stage":"` + stage + `", "result":"failure", "errorMessage":"Line 1\nLine 2\nLine 3"} +]` +} + +func failedReleaseImage() []logrus.Entry { + return []logrus.Entry{ + {Level: logrus.ErrorLevel, Message: "The bootstrap machine failed to download the release image"}, + {Level: logrus.InfoLevel, Message: "Line 1"}, + {Level: logrus.InfoLevel, Message: "Line 2"}, + {Level: logrus.InfoLevel, Message: "Line 3"}, + } +} + +func failedURLChecks() []logrus.Entry { + return []logrus.Entry{ + {Level: logrus.WarnLevel, Message: "The bootstrap machine is unable to resolve API and/or API-Int Server URLs"}, + {Level: logrus.InfoLevel, Message: "Line 1"}, + {Level: logrus.InfoLevel, Message: "Line 2"}, + {Level: logrus.InfoLevel, Message: "Line 3"}, + } +} + func TestAnalyzeGatherBundle(t *testing.T) { cases := []struct { name string @@ -33,42 +68,60 @@ func TestAnalyzeGatherBundle(t *testing.T) { }, }, { - name: "release-image successful", + name: "bootkube not started", files: map[string]string{ - "log-bundle/bootstrap/services/release-image.json": `[ -{"phase":"service 
start"}, -{"phase":"stage start", "stage":"pull-release-image"}, -{"phase":"stage end", "stage":"pull-release-image", "result":"success"}, -{"phase":"service end", "result":"success"} -]`, + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": "[]", + }, + expectedOutput: []logrus.Entry{ + {Level: logrus.ErrorLevel, Message: "The bootstrap machine did not execute the bootkube.service systemd unit"}, }, }, { - name: "release-image successful bootstrap-in-place", + name: "release-image and API Server URL successful", files: map[string]string{ - "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": `[ -{"phase":"service start"}, -{"phase":"stage start", "stage":"pull-release-image"}, -{"phase":"stage end", "stage":"pull-release-image", "result":"success"}, -{"phase":"service end", "result":"success"} -]`, + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), }, }, { - name: "release-image failed", + name: "release-image and API Server URL successful bootstrap-in-place", files: map[string]string{ - "log-bundle/bootstrap/services/release-image.json": `[ -{"phase":"service start"}, -{"phase":"stage start", "stage":"pull-release-image"}, -{"phase":"stage end", "stage":"pull-release-image", "result":"failure", "errorMessage":"Line 1\nLine 2\nLine 3"} -]`, + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), }, - expectedOutput: []logrus.Entry{ - {Level: logrus.ErrorLevel, Message: "The bootstrap machine failed to download the release image"}, - {Level: logrus.InfoLevel, Message: "Line 1"}, - {Level: logrus.InfoLevel, Message: "Line 2"}, - {Level: logrus.InfoLevel, Message: "Line 
3"}, + }, + { + name: "only release-image failed", + files: map[string]string{ + "log-bundle/bootstrap/services/release-image.json": generateFailureOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), }, + expectedOutput: failedReleaseImage(), + }, + { + name: "API Server URL failed", + files: map[string]string{ + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), + }, + expectedOutput: failedURLChecks(), + }, + { + name: "API-INT Server URL failed", + files: map[string]string{ + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-int-url"), + }, + expectedOutput: failedURLChecks(), + }, + { + name: "both release-image and API Server URLs failed", + files: map[string]string{ + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateFailureOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), + }, + expectedOutput: failedReleaseImage(), }, { name: "empty release-image.json", @@ -80,6 +133,17 @@ func TestAnalyzeGatherBundle(t *testing.T) { {Level: logrus.ErrorLevel, Message: "The bootstrap machine did not execute the release-image.service systemd unit"}, }, }, + { + name: "empty bootkube.json", + files: map[string]string{ + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": "", + }, + expectedOutput: []logrus.Entry{ + {Level: logrus.InfoLevel, Message: "Could not analyze the bootkube.service: service entries file does not begin with a token: EOF"}, + {Level: logrus.ErrorLevel, Message: "The bootstrap machine did not 
execute the bootkube.service systemd unit"}, + }, + }, { name: "malformed release-image.json", files: map[string]string{ @@ -90,6 +154,17 @@ func TestAnalyzeGatherBundle(t *testing.T) { {Level: logrus.ErrorLevel, Message: "The bootstrap machine did not execute the release-image.service systemd unit"}, }, }, + { + name: "malformed bootkube.json", + files: map[string]string{ + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/bootkube.json": "{}", + }, + expectedOutput: []logrus.Entry{ + {Level: logrus.InfoLevel, Message: "Could not analyze the bootkube.service: service entries file does not begin with an array"}, + {Level: logrus.ErrorLevel, Message: "The bootstrap machine did not execute the bootkube.service systemd unit"}, + }, + }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) {