Skip to content

Commit

Permalink
Merge pull request #20093 from stevekuznetsov/skuznets/interrupt-beha…
Browse files Browse the repository at this point in the history
…vior-gcsupload

prow: sidecar: allow ignoring interrupts
  • Loading branch information
k8s-ci-robot authored Dec 2, 2020
2 parents 1bfb2ee + 790ffe4 commit 2a5710e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 8 deletions.
29 changes: 29 additions & 0 deletions prow/sidecar/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,35 @@ type Options struct {

// EntryError requires all entries to pass in order to exit cleanly.
EntryError bool `json:"entry_error,omitempty"`

// IgnoreInterrupts instructs the waiting process to ignore interrupt
// signals. An interrupt signal is sent to this process when the kubelet
// decides to delete the test Pod. This may be as a result of:
// - the ProwJob exceeding the `default_job_timeout` as configured for Prow
// - the ProwJob exceeding the `timeout` as configured for the job itself
// - the Pod exceeding the `pod_running_timeout` as configured for Prow
// - cluster instability causing the Pod to be evicted
//
// When this happens, the `entrypoint` process also gets the signal, and
// forwards it to the process under test. `entrypoint` will wait for the
// test process to exit, either configured with:
// - `grace_period` in the default decoration configurations for Prow
// - `grace_period` in the job's specific configuration
// After the grace period, `entrypoint` will forcefully terminate the test
// process and signal to `sidecar` that the process has exited.
//
// In parallel, the kubelet will be waiting on the Pod's `terminationGracePeriod`
// after sending the interrupt signal, at which point the kubelet will forcefully
// terminate all containers in the Pod.
//
// If `ignore_interrupts` is set, `sidecar` will do nothing upon receipt of
// the interrupt signal; this implicitly means that upload of logs and artifacts
// will begin when the test process exits, which may be as long as the grace
// period if the test process does not gracefully handle interrupts. This will
// require that the user configures the Pod's termination grace period to be
// longer than the `entrypoint` grace period for the test process and the time
// taken by `sidecar` to upload all relevant artifacts.
IgnoreInterrupts bool `json:"ignore_interrupts,omitempty"`
}

func (o Options) entries() []wrapper.Options {
Expand Down
20 changes: 12 additions & 8 deletions prow/sidecar/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,19 +79,23 @@ func (o Options) Run(ctx context.Context) (int, error) {

ctx, cancel := context.WithCancel(ctx)

// If we are being asked to terminate by the kubelet but we have
// NOT seen the test process exit cleanly, we need a to start
// uploading artifacts to GCS immediately. If we notice the process
// exit while doing this best-effort upload, we can race with the
// second upload but we can tolerate this as we'd rather get SOME
// data into GCS than attempt to cancel these uploads and get none.
interrupt := make(chan os.Signal)
signal.Notify(interrupt, os.Interrupt, syscall.SIGTERM)
go func() {
select {
case s := <-interrupt:
logrus.Errorf("Received an interrupt: %s, cancelling...", s)
cancel()
if o.IgnoreInterrupts {
logrus.Warnf("Received an interrupt: %s, ignoring...", s)
} else {
// If we are being asked to terminate by the kubelet but we have
// NOT seen the test process exit cleanly, we need a to start
// uploading artifacts to GCS immediately. If we notice the process
// exit while doing this best-effort upload, we can race with the
// second upload but we can tolerate this as we'd rather get SOME
// data into GCS than attempt to cancel these uploads and get none.
logrus.Errorf("Received an interrupt: %s, cancelling...", s)
cancel()
}
case <-ctx.Done():
}
}()
Expand Down

0 comments on commit 2a5710e

Please sign in to comment.