Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 52 additions & 25 deletions pkg/gather/service/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@ import (
"encoding/json"
"io"
"os"
"regexp"
"strings"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)

// regex matching the path of a service entries file. The captured group is the name of the service.
// For example, if the filename is "log-bundle-20210329190553/bootstrap/services/release-image.json",
// then the name of the service is "release-image".
var serviceEntriesFilePathRegex = regexp.MustCompile(`^[^\/]+\/bootstrap\/services\/([^.]+)\.json$`)

// AnalyzeGatherBundle will analyze the bootstrap gather bundle at the specified path.
// Analysis will be logged.
// Returns an error if there was a problem reading the bundle.
Expand All @@ -35,7 +41,7 @@ func analyzeGatherBundle(bundleFile io.Reader) error {

// read through the tar for relevant files
tarReader := tar.NewReader(uncompressedStream)
var releaseImageAnalysis *analysis
serviceAnalyses := make(map[string]analysis)
for {
header, err := tarReader.Next()
if err == io.EOF {
Expand All @@ -47,36 +53,51 @@ func analyzeGatherBundle(bundleFile io.Reader) error {
if header.Typeflag != tar.TypeReg {
continue
}
filenameParts := strings.SplitN(header.Name, "/", 2)
if len(filenameParts) != 2 {

serviceEntriesFileSubmatch := serviceEntriesFilePathRegex.FindStringSubmatch(header.Name)
if serviceEntriesFileSubmatch == nil {
continue
}
// we only care about the release-image.service for now. in the future, we will look at other services, too.
if filenameParts[1] == "bootstrap/services/release-image.json" {
var err error
releaseImageAnalysis, err = analyzeService(tarReader)
if err != nil {
logrus.Infof("Could not analyze the release-image.service: %v", err)
}
break
serviceName := serviceEntriesFileSubmatch[1]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we using the second match (or submatch--whatever the right term is)?

Perhaps it is a safe assumption that there are at least two matches to this regex pattern but I think it would be a good idea to check the length as well when checking for nil above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The length of the submatches is determined by the regex. If the length is less than 2, that is a coding error not a runtime error. Throwing a panic is the right thing to do in the face of a coding error.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And the second submatch is the match of ([^.]+) in the overall ^[^\/]+\/bootstrap\/services\/([^.]+)\.json$ expression.


serviceAnalysis, err := analyzeService(tarReader)
if err != nil {
logrus.Infof("Could not analyze the %s.service: %v", serviceName, err)
continue
}

serviceAnalyses[serviceName] = serviceAnalysis
}

// log details about the release-image.service.
if releaseImageAnalysis != nil && releaseImageAnalysis.starts > 0 {
if !releaseImageAnalysis.successful {
logrus.Error("The bootstrap machine failed to download the release image")
for _, l := range strings.Split(releaseImageAnalysis.lastError, "\n") {
logrus.Info(l)
}
analysisChecks := []struct {
name string
check func(analysis) bool
}{
{name: "release-image", check: checkReleaseImageDownload},
}
for _, check := range analysisChecks {
a := serviceAnalyses[check.name]
if a.starts == 0 {
logrus.Errorf("The bootstrap machine did not execute the %s.service systemd unit", check.name)
break
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

once we add more checks then we want to remove this break, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. If the higher-priority service check determines that the service did not start, then we want to stop looking for errors in lower-priority service checks. For example, if we determine that the release-image service did not start, then we don't want to print errors that we find in the bootkube service, because any errors in the bootkube service can be traced back to the release-image service failing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed that the check priority was determined by the order in the slice. This makes sense now.

}
if !check.check(a) {
break
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This break too. IIUC these seem like optimizations based on the limited number of checks, which is fine for the time being.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same response as above, where this break is what we want long-term.

}
} else {
logrus.Error("The bootstrap machine did not execute the release-image.service systemd unit")
}

return nil
}

func checkReleaseImageDownload(a analysis) bool {
if a.successful {
return true
}
logrus.Error("The bootstrap machine failed to download the release image")
a.logLastError()
return false
}

type analysis struct {
// starts is the number of times that the service started
starts int
Expand All @@ -88,8 +109,8 @@ type analysis struct {
lastError string
}

func analyzeService(r io.Reader) (*analysis, error) {
a := &analysis{}
func analyzeService(r io.Reader) (analysis, error) {
a := analysis{}
decoder := json.NewDecoder(r)
t, err := decoder.Token()
if err != nil {
Expand All @@ -106,16 +127,16 @@ func analyzeService(r io.Reader) (*analysis, error) {
for decoder.More() {
entry := &Entry{}
if err := decoder.Decode(entry); err != nil {
return nil, errors.Wrap(err, "could not decode an entry in the service entries file")
return a, errors.Wrap(err, "could not decode an entry in the service entries file")
}

// record a new start of the service
if entry.Phase == ServiceStart {
a.starts++
}

// the service is only considered considered successful if the very last entry is either the service ending
// successfully or a post-command ending successfully.
// the service is only considered successful if the last entry is either the service ending successfully or a
// post-command ending successfully.
a.successful = entry.Result == Success && (entry.Phase == ServiceEnd || entry.Phase == PostCommandEnd)

// save the last error
Expand All @@ -131,3 +152,9 @@ func analyzeService(r io.Reader) (*analysis, error) {
}
return a, nil
}

func (a analysis) logLastError() {
for _, l := range strings.Split(a.lastError, "\n") {
logrus.Info(l)
}
}