-
Notifications
You must be signed in to change notification settings - Fork 1.5k
cmd: diagnose problems downloading release image #4751
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
fdb04a7
271f86c
3441e2a
779c6ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,12 +6,18 @@ import ( | |
| "encoding/json" | ||
| "io" | ||
| "os" | ||
| "regexp" | ||
| "strings" | ||
|
|
||
| "github.com/pkg/errors" | ||
| "github.com/sirupsen/logrus" | ||
| ) | ||
|
|
||
| // regex matching the path of a service entries file. The captured group is the name of the service. | ||
| // For example, if the filename is "log-bundle-20210329190553/bootstrap/services/release-image.json", | ||
| // then the name of the service is "release-image". | ||
| var serviceEntriesFilePathRegex = regexp.MustCompile(`^[^\/]+\/bootstrap\/services\/([^.]+)\.json$`) | ||
|
|
||
| // AnalyzeGatherBundle will analyze the bootstrap gather bundle at the specified path. | ||
| // Analysis will be logged. | ||
| // Returns an error if there was a problem reading the bundle. | ||
|
|
@@ -35,7 +41,7 @@ func analyzeGatherBundle(bundleFile io.Reader) error { | |
|
|
||
| // read through the tar for relevant files | ||
| tarReader := tar.NewReader(uncompressedStream) | ||
| var releaseImageAnalysis *analysis | ||
| serviceAnalyses := make(map[string]analysis) | ||
| for { | ||
| header, err := tarReader.Next() | ||
| if err == io.EOF { | ||
|
|
@@ -47,36 +53,51 @@ func analyzeGatherBundle(bundleFile io.Reader) error { | |
| if header.Typeflag != tar.TypeReg { | ||
| continue | ||
| } | ||
| filenameParts := strings.SplitN(header.Name, "/", 2) | ||
| if len(filenameParts) != 2 { | ||
|
|
||
| serviceEntriesFileSubmatch := serviceEntriesFilePathRegex.FindStringSubmatch(header.Name) | ||
| if serviceEntriesFileSubmatch == nil { | ||
| continue | ||
| } | ||
| // we only care about the release-image.service for now. in the future, we will look at other services, too. | ||
| if filenameParts[1] == "bootstrap/services/release-image.json" { | ||
| var err error | ||
| releaseImageAnalysis, err = analyzeService(tarReader) | ||
| if err != nil { | ||
| logrus.Infof("Could not analyze the release-image.service: %v", err) | ||
| } | ||
| break | ||
| serviceName := serviceEntriesFileSubmatch[1] | ||
|
|
||
| serviceAnalysis, err := analyzeService(tarReader) | ||
| if err != nil { | ||
| logrus.Infof("Could not analyze the %s.service: %v", serviceName, err) | ||
| continue | ||
| } | ||
|
|
||
| serviceAnalyses[serviceName] = serviceAnalysis | ||
| } | ||
|
|
||
| // log details about the release-image.service. | ||
| if releaseImageAnalysis != nil && releaseImageAnalysis.starts > 0 { | ||
| if !releaseImageAnalysis.successful { | ||
| logrus.Error("The bootstrap machine failed to download the release image") | ||
| for _, l := range strings.Split(releaseImageAnalysis.lastError, "\n") { | ||
| logrus.Info(l) | ||
| } | ||
| analysisChecks := []struct { | ||
| name string | ||
| check func(analysis) bool | ||
| }{ | ||
| {name: "release-image", check: checkReleaseImageDownload}, | ||
| } | ||
| for _, check := range analysisChecks { | ||
| a := serviceAnalyses[check.name] | ||
| if a.starts == 0 { | ||
| logrus.Errorf("The bootstrap machine did not execute the %s.service systemd unit", check.name) | ||
| break | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Once we add more checks, we will want to remove this break.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No. If the higher-priority service check determines that the service did not start, then we want to stop looking for errors in lower-priority service checks. For example, if we determine that the release-image service did not start, then we don't want to print errors that we find in the bootkube service, because any errors in the bootkube service can be traced back to the release-image service failing.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I missed that the check priority was determined by the order in the slice. This makes sense now. |
||
| } | ||
| if !check.check(a) { | ||
| break | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same response as above: this break is what we want long-term. |
||
| } | ||
| } else { | ||
| logrus.Error("The bootstrap machine did not execute the release-image.service systemd unit") | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func checkReleaseImageDownload(a analysis) bool { | ||
| if a.successful { | ||
| return true | ||
| } | ||
| logrus.Error("The bootstrap machine failed to download the release image") | ||
| a.logLastError() | ||
| return false | ||
| } | ||
|
|
||
| type analysis struct { | ||
| // starts is the number of times that the service started | ||
| starts int | ||
|
|
@@ -88,8 +109,8 @@ type analysis struct { | |
| lastError string | ||
| } | ||
|
|
||
| func analyzeService(r io.Reader) (*analysis, error) { | ||
| a := &analysis{} | ||
| func analyzeService(r io.Reader) (analysis, error) { | ||
| a := analysis{} | ||
| decoder := json.NewDecoder(r) | ||
| t, err := decoder.Token() | ||
| if err != nil { | ||
|
|
@@ -106,16 +127,16 @@ func analyzeService(r io.Reader) (*analysis, error) { | |
| for decoder.More() { | ||
| entry := &Entry{} | ||
| if err := decoder.Decode(entry); err != nil { | ||
| return nil, errors.Wrap(err, "could not decode an entry in the service entries file") | ||
| return a, errors.Wrap(err, "could not decode an entry in the service entries file") | ||
| } | ||
|
|
||
| // record a new start of the service | ||
| if entry.Phase == ServiceStart { | ||
| a.starts++ | ||
| } | ||
|
|
||
| // the service is only considered considered successful if the very last entry is either the service ending | ||
| // successfully or a post-command ending successfully. | ||
| // the service is only considered successful if the last entry is either the service ending successfully or a | ||
| // post-command ending successfully. | ||
| a.successful = entry.Result == Success && (entry.Phase == ServiceEnd || entry.Phase == PostCommandEnd) | ||
|
|
||
| // save the last error | ||
|
|
@@ -131,3 +152,9 @@ func analyzeService(r io.Reader) (*analysis, error) { | |
| } | ||
| return a, nil | ||
| } | ||
|
|
||
| func (a analysis) logLastError() { | ||
| for _, l := range strings.Split(a.lastError, "\n") { | ||
| logrus.Info(l) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are we using the second match (or submatch--whatever the right term is)?
Perhaps it is a safe assumption that there are at least two matches to this regex pattern but I think it would be a good idea to check the length as well when checking for nil above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The length of the submatches is determined by the regex. If the length is less than 2, that is a coding error, not a runtime error. Throwing a panic is the right thing to do in the face of a coding error.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And the second submatch is the match of `([^.]+)` in the overall `^[^\/]+\/bootstrap\/services\/([^.]+)\.json$` expression.