Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 43 additions & 13 deletions pkg/jobrunaggregator/jobrunaggregatoranalyzer/analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ import (
"strings"
"time"

"gopkg.in/yaml.v2"

"k8s.io/apimachinery/pkg/util/clock"
"sigs.k8s.io/yaml"

"github.com/openshift/ci-tools/pkg/jobrunaggregator/jobrunaggregatorapi"
"github.com/openshift/ci-tools/pkg/jobrunaggregator/jobrunaggregatorlib"
Expand Down Expand Up @@ -66,8 +67,9 @@ func (o *JobRunAggregatorAnalyzerOptions) getRelatedJobs(ctx context.Context) ([
func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {
// if it hasn't been more than an hour since the jobRuns started, the list isn't complete.
readyAt := o.jobRunStartEstimate.Add(1 * time.Hour)
timeToStopWaiting := o.jobRunStartEstimate.Add(3*time.Hour + 10*time.Minute)
Copy link
Contributor

@dgoodwin dgoodwin Sep 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eyeballing sippy and prow: I see some 4.10 Azure ovn upgrade jobs passing after 3:20 pretty consistently, so we might need a little more time here.
AWS jobs land around this range, 2:45 -> 3:05.
GCP jobs are often a little faster, around 2:35.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> Eyeballing sippy and prow: I see some 4.10 Azure ovn upgrade jobs passing after 3:20 pretty consistently, so might need a little more time here.

There is no more time. This is pressing up against the max ci-operator time.

Copy link
Member

@wking wking Sep 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In openshift/release#22289, I'm talking about this a bit, and I think we might need to talk the test-platform folks into raising the Plank/Prow timeout above its current 4h, or giving us a way to do that for particular generated jobs (there's already a way to raise it for non-generated jobs via decoration_config.timeout).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And IIRC we're not planning on aggregating ovn-azure anyhow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> And IIRC we're not planning on aggregating ovn-azure anyhow?

We will.


fmt.Printf("Aggregating job runs of type %q for %q. ReadyAt=%v, now=%v.\n", o.jobName, o.payloadTag, readyAt, o.clock.Now())
fmt.Printf("Aggregating job runs of type %q for %q. now=%v, ReadyAt=%v, timeToStopWaiting=%v.\n", o.jobName, o.payloadTag, o.clock.Now(), readyAt, timeToStopWaiting)
ctx, cancel := context.WithTimeout(ctx, o.timeout)
defer cancel()

Expand All @@ -78,11 +80,13 @@ func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {

var finishedJobsToAggregate []jobrunaggregatorapi.JobRunInfo
var finishedJobRunNames []string
var unfinishedJobNames []string
for { // TODO extract to a method.
fmt.Println() // for prettier logs
// reset vars
finishedJobsToAggregate = []jobrunaggregatorapi.JobRunInfo{}
finishedJobRunNames = []string{}
unfinishedJobNames = []string{}

relatedJobs, err := o.getRelatedJobs(ctx)
if err != nil {
Expand All @@ -100,7 +104,6 @@ func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {
return fmt.Errorf("%q for %q: found no related jobRuns", o.jobName, o.payloadTag)
}

unfinishedJobNames := []string{}
for i := range relatedJobs {
relatedJob := relatedJobs[i]
if !relatedJob.IsFinished(ctx) {
Expand All @@ -124,6 +127,12 @@ func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {
finishedJobRunNames = append(finishedJobRunNames, relatedJob.GetJobRunID())
}

// ready or not, it's time to check
if o.clock.Now().After(timeToStopWaiting) {
fmt.Printf("%q for %q: waited long enough. Ready or not, here I come. (readyOrNot=%v now=%v)\n", o.jobName, o.payloadTag, timeToStopWaiting, o.clock.Now())
break
}

if len(unfinishedJobNames) > 0 {
fmt.Printf("%q for %q: found %d unfinished related jobRuns: %v\n", o.jobName, o.payloadTag, len(unfinishedJobNames), strings.Join(unfinishedJobNames, ", "))
time.Sleep(2 * time.Minute)
Expand All @@ -133,9 +142,29 @@ func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {
break
}

if len(unfinishedJobNames) > 0 {
fmt.Printf("%q for %q: found %d unfinished related jobRuns: %v\n", o.jobName, o.payloadTag, len(unfinishedJobNames), strings.Join(unfinishedJobNames, ", "))
}
// if more than three jobruns timed out, just fail the entire aggregation
if len(unfinishedJobNames) > 3 {
return fmt.Errorf("%q for %q: found %d unfinished related jobRuns: %v\n", o.jobName, o.payloadTag, len(unfinishedJobNames), strings.Join(unfinishedJobNames, ", "))
}
fmt.Printf("%q for %q: aggregating %d related jobRuns: %v\n", o.jobName, o.payloadTag, len(finishedJobsToAggregate), strings.Join(finishedJobRunNames, ", "))

aggregationConfiguration := &AggregationConfiguration{}
for _, jobRunName := range unfinishedJobNames {
aggregationConfiguration.FinishedJobs = append(
aggregationConfiguration.FinishedJobs,
JobRunInfo{
JobName: o.jobName,
JobRunID: jobRunName,
HumanURL: jobrunaggregatorapi.GetHumanURL(o.jobName, jobRunName),
GCSBucketURL: jobrunaggregatorapi.GetGCSArtifactURL(o.jobName, jobRunName),
Status: "unknown",
},
)
}

currentAggregationJunit := &aggregatedJobRunJunit{}
for i := range finishedJobsToAggregate {
jobRun := finishedJobsToAggregate[i]
Expand All @@ -147,8 +176,8 @@ func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {
if err != nil {
return err
}
aggregationConfiguration.IndividualJobs = append(
aggregationConfiguration.IndividualJobs,
aggregationConfiguration.FinishedJobs = append(
aggregationConfiguration.FinishedJobs,
JobRunInfo{
JobName: jobRun.GetJobName(),
JobRunID: jobRun.GetJobRunID(),
Expand All @@ -161,27 +190,28 @@ func (o *JobRunAggregatorAnalyzerOptions) Run(ctx context.Context) error {
currentAggregationJunit.addJobRun(jobrunaggregatorlib.GetPayloadTagFromProwJob(prowJob), currJunit)
}

fmt.Printf("%q for %q: aggregating junit tests.\n", o.jobName, o.payloadTag)
currentAggregationJunitSuites, err := currentAggregationJunit.aggregateAllJobRuns()
// write out the jobruns aggregated by this jobrun.
aggregationConfigYAML, err := yaml.Marshal(aggregationConfiguration)
if err != nil {
return err
}
if err := assignPassFail(ctx, currentAggregationJunitSuites, o.passFailCalculator); err != nil {
if err := ioutil.WriteFile(filepath.Join(currentAggregationDir, "aggregation-config.yaml"), aggregationConfigYAML, 0644); err != nil {
return err
}
currentAggrationJunitXML, err := xml.Marshal(currentAggregationJunitSuites)

fmt.Printf("%q for %q: aggregating junit tests.\n", o.jobName, o.payloadTag)
currentAggregationJunitSuites, err := currentAggregationJunit.aggregateAllJobRuns()
if err != nil {
return err
}
if err := ioutil.WriteFile(filepath.Join(currentAggregationDir, "junit-aggregated.xml"), currentAggrationJunitXML, 0644); err != nil {
if err := assignPassFail(ctx, currentAggregationJunitSuites, o.passFailCalculator); err != nil {
return err
}

aggregationConfigYAML, err := yaml.Marshal(aggregationConfiguration)
currentAggrationJunitXML, err := xml.Marshal(currentAggregationJunitSuites)
if err != nil {
return err
}
if err := ioutil.WriteFile(filepath.Join(currentAggregationDir, "aggregation-config.yaml"), aggregationConfigYAML, 0644); err != nil {
if err := ioutil.WriteFile(filepath.Join(currentAggregationDir, "junit-aggregated.xml"), currentAggrationJunitXML, 0644); err != nil {
return err
}

Expand Down
3 changes: 2 additions & 1 deletion pkg/jobrunaggregator/jobrunaggregatoranalyzer/types.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package jobrunaggregatoranalyzer

type AggregationConfiguration struct {
IndividualJobs []JobRunInfo
UnfinishedJobs []JobRunInfo
FinishedJobs []JobRunInfo
}

type JobRunInfo struct {
Expand Down