From 50c02619c91b0cc9e976f2f204593da8556a467e Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Thu, 29 May 2025 15:21:02 -0400 Subject: [PATCH 1/2] gather: collect logs & analyze node-image-pull As part of the overlay node image work, commit 60c63bb8bc4a2481310dea0d8b28da6d25d055b4 introduced a new service that pulls the node image. This commit updates the installer gather script to collect that service's journal and the analyze command to check the service's result. --- data/data/bootstrap/files/usr/local/bin/installer-gather.sh | 2 +- pkg/gather/service/analyze.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/data/data/bootstrap/files/usr/local/bin/installer-gather.sh b/data/data/bootstrap/files/usr/local/bin/installer-gather.sh index 0eba2dfa325..02bf7d49d94 100755 --- a/data/data/bootstrap/files/usr/local/bin/installer-gather.sh +++ b/data/data/bootstrap/files/usr/local/bin/installer-gather.sh @@ -35,7 +35,7 @@ done echo "Gathering bootstrap journals ..." mkdir -p "${ARTIFACTS}/bootstrap/journals" for service in approve-csr bootkube crio crio-configure image-customization ironic ironic-dnsmasq ironic-httpd ironic-ramdisk-logs \ - kubelet master-bmh-update metal3-baremetal-operator release-image release-image-download sssd + kubelet master-bmh-update metal3-baremetal-operator release-image release-image-download sssd node-image-pull do journalctl --boot --no-pager --output=short --unit="${service}" > "${ARTIFACTS}/bootstrap/journals/${service}.log" done diff --git a/pkg/gather/service/analyze.go b/pkg/gather/service/analyze.go index 680dc5b6247..0151ac512ef 100644 --- a/pkg/gather/service/analyze.go +++ b/pkg/gather/service/analyze.go @@ -84,6 +84,7 @@ func analyzeGatherBundle(bundleFile io.Reader) error { check func(analysis) bool optional bool }{ + {name: "node-image-pull", check: checkReleaseImageDownload, optional: false}, {name: "release-image", check: checkReleaseImageDownload, optional: false}, {name: "bootkube", check: checkBootkubeService, optional: false}, } From 
250b7ffe1608fea2eea6d707407c7beae2dd85e7 Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Mon, 27 Oct 2025 13:06:51 -0400 Subject: [PATCH 2/2] gather: add node-image-pull service Update the analyze command to check for a failed node-image-pull service, so that users are presented with a helpful error message if they have a bad pull secret. The node-image-pull script now calls the bootstrap-service-record helpers to record its stage start, failure, and success, giving the analysis a result to inspect. --- .../usr/local/bin/node-image-pull.sh.template | 8 +++ pkg/gather/service/analyze.go | 11 +++- pkg/gather/service/analyze_test.go | 60 +++++++++++++------ 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template b/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template index ba7bcaf4198..6059d13aff8 100755 --- a/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template @@ -1,6 +1,11 @@ #!/bin/bash set -euo pipefail +# shellcheck source=bootstrap-service-record.sh +. /usr/local/bin/bootstrap-service-record.sh + +record_service_stage_start "node-image-pull" + # shellcheck source=release-image.sh.template . 
/usr/local/bin/release-image.sh @@ -64,6 +69,7 @@ ref=$(ostree refs --repo "${ostree_repo}" | grep ^ostree/container/image/docker) if [ $(echo "$ref" | wc -l) != 1 ]; then echo "Expected single docker ref, found:" echo "$ref" + record_service_stage_failure exit 1 fi ostree refs --repo "${ostree_repo}" "$ref" --create coreos/node-image @@ -88,3 +94,5 @@ if grep -q coreos.liveiso= /proc/cmdline; then echo "Deleting temporary repo" rm -rf "${ostree_repo}" fi + +record_service_stage_success diff --git a/pkg/gather/service/analyze.go b/pkg/gather/service/analyze.go index 0151ac512ef..f5365f80f87 100644 --- a/pkg/gather/service/analyze.go +++ b/pkg/gather/service/analyze.go @@ -84,8 +84,8 @@ func analyzeGatherBundle(bundleFile io.Reader) error { check func(analysis) bool optional bool }{ - {name: "node-image-pull", check: checkReleaseImageDownload, optional: false}, {name: "release-image", check: checkReleaseImageDownload, optional: false}, + {name: "node-image-pull", check: checkNodeImagePull, optional: false}, {name: "bootkube", check: checkBootkubeService, optional: false}, } for _, check := range analysisChecks { @@ -115,6 +115,15 @@ func checkReleaseImageDownload(a analysis) bool { return false } +func checkNodeImagePull(a analysis) bool { + if a.successful { + return true + } + logrus.Error("Node image pull failed on the bootstrap machine") + a.logLastError() + return false +} + // bootstrap-verify-api-servel-urls.sh is currently running as part of the bootkube service. // And the verification of the API and API-Int URLs are the only stage where a failure is // currently reported. 
So, here we are able to conclude that a failure corresponds to a diff --git a/pkg/gather/service/analyze_test.go b/pkg/gather/service/analyze_test.go index 4764783dc3a..3bad758c470 100644 --- a/pkg/gather/service/analyze_test.go +++ b/pkg/gather/service/analyze_test.go @@ -37,6 +37,15 @@ func failedReleaseImage() []logrus.Entry { } } +func failedNodeImagePull() []logrus.Entry { + return []logrus.Entry{ + {Level: logrus.ErrorLevel, Message: "Node image pull failed on the bootstrap machine"}, + {Level: logrus.InfoLevel, Message: "Line 1"}, + {Level: logrus.InfoLevel, Message: "Line 2"}, + {Level: logrus.InfoLevel, Message: "Line 3"}, + } +} + func failedURLChecks() []logrus.Entry { return []logrus.Entry{ {Level: logrus.InfoLevel, Message: "Line 1"}, @@ -69,56 +78,69 @@ func TestAnalyzeGatherBundle(t *testing.T) { { name: "bootkube not started", files: map[string]string{ - "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": "[]", + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": "[]", }, expectedOutput: []logrus.Entry{ {Level: logrus.ErrorLevel, Message: "The bootstrap machine did not execute the bootkube.service systemd unit"}, }, }, { - name: "release-image and API Server URL successful", + name: "release-image, node-image and API Server URL successful", files: map[string]string{ - "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + 
"log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), }, }, { name: "release-image and API Server URL successful bootstrap-in-place", files: map[string]string{ - "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), }, }, { name: "only release-image failed", files: map[string]string{ "log-bundle/bootstrap/services/release-image.json": generateFailureOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": generateSuccessOutput("check-api-url"), }, expectedOutput: failedReleaseImage(), }, + { + name: "only node-image-pull failed", + files: map[string]string{ + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/bootstrap/services/node-image-pull.json": generateFailureOutput("node-image-pull"), + }, + expectedOutput: failedNodeImagePull(), + }, { name: "API Server URL failed", files: map[string]string{ - "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), }, expectedOutput: failedURLChecks(), }, { name: 
"API-INT Server URL failed", files: map[string]string{ - "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-int-url"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-int-url"), }, expectedOutput: failedURLChecks(), }, { name: "both release-image and API Server URLs failed", files: map[string]string{ - "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateFailureOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateFailureOutput("pull-release-image"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/node-image-pull.json": generateFailureOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), }, expectedOutput: failedReleaseImage(), }, @@ -135,8 +157,9 @@ func TestAnalyzeGatherBundle(t *testing.T) { { name: "empty bootkube.json", files: map[string]string{ - "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": "", + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": "", }, expectedOutput: []logrus.Entry{ {Level: logrus.InfoLevel, Message: "Could not analyze the bootkube.service: service entries file does not begin 
with a token: EOF"}, @@ -156,8 +179,9 @@ func TestAnalyzeGatherBundle(t *testing.T) { { name: "malformed bootkube.json", files: map[string]string{ - "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), - "log-bundle/bootstrap/services/bootkube.json": "{}", + "log-bundle/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), + "log-bundle/log-bundle-bootstrap/bootstrap/services/node-image-pull.json": generateSuccessOutput("node-image-pull"), + "log-bundle/bootstrap/services/bootkube.json": "{}", }, expectedOutput: []logrus.Entry{ {Level: logrus.InfoLevel, Message: "Could not analyze the bootkube.service: service entries file does not begin with an array"},