From 790ffe4f15ac7a94fdcef120ec8243cea3afedba Mon Sep 17 00:00:00 2001 From: Steve Kuznetsov Date: Wed, 2 Dec 2020 07:39:45 -0800 Subject: [PATCH] prow: sidecar: allow ignoring interrupts When a test process is well-formed - if it exits quickly and cleanly before the grace period is over once an interrupt is sent - the upload can simply wait on it, instead of beginning the upload straight away. Signed-off-by: Steve Kuznetsov --- prow/sidecar/options.go | 29 +++++++++++++++++++++++++++++ prow/sidecar/run.go | 20 ++++++++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/prow/sidecar/options.go b/prow/sidecar/options.go index f814a2e7f4e7..cc901af22e37 100644 --- a/prow/sidecar/options.go +++ b/prow/sidecar/options.go @@ -46,6 +46,35 @@ type Options struct { // EntryError requires all entries to pass in order to exit cleanly. EntryError bool `json:"entry_error,omitempty"` + + // IgnoreInterrupts instructs the waiting process to ignore interrupt + // signals. An interrupt signal is sent to this process when the kubelet + // decides to delete the test Pod. This may be as a result of: + // - the ProwJob exceeding the `default_job_timeout` as configured for Prow + // - the ProwJob exceeding the `timeout` as configured for the job itself + // - the Pod exceeding the `pod_running_timeout` as configured for Prow + // - cluster instability causing the Pod to be evicted + // + // When this happens, the `entrypoint` process also gets the signal, and + // forwards it to the process under test. `entrypoint` will wait for the + // test process to exit, either configured with: + // - `grace_period` in the default decoration configurations for Prow + // - `grace_period` in the job's specific configuration + // After the grace period, `entrypoint` will forcefully terminate the test + // process and signal to `sidecar` that the process has exited. + // + // In parallel, the kubelet will be waiting on the Pod's `terminationGracePeriod` + // after sending the interrupt signal, at which point the kubelet will forcefully + // terminate all containers in the Pod. + // + // If `ignore_interrupts` is set, `sidecar` will do nothing upon receipt of + // the interrupt signal; this implicitly means that upload of logs and artifacts + // will begin when the test process exits, which may be as long as the grace + // period if the test process does not gracefully handle interrupts. This will + // require that the user configures the Pod's termination grace period to be + // longer than the `entrypoint` grace period for the test process and the time + // taken by `sidecar` to upload all relevant artifacts. + IgnoreInterrupts bool `json:"ignore_interrupts,omitempty"` } func (o Options) entries() []wrapper.Options { diff --git a/prow/sidecar/run.go b/prow/sidecar/run.go index ba75058f32fd..c441eace47b7 100644 --- a/prow/sidecar/run.go +++ b/prow/sidecar/run.go @@ -79,19 +79,23 @@ func (o Options) Run(ctx context.Context) (int, error) { ctx, cancel := context.WithCancel(ctx) - // If we are being asked to terminate by the kubelet but we have - // NOT seen the test process exit cleanly, we need a to start - // uploading artifacts to GCS immediately. If we notice the process - // exit while doing this best-effort upload, we can race with the - // second upload but we can tolerate this as we'd rather get SOME - // data into GCS than attempt to cancel these uploads and get none. interrupt := make(chan os.Signal) signal.Notify(interrupt, os.Interrupt, syscall.SIGTERM) go func() { select { case s := <-interrupt: - logrus.Errorf("Received an interrupt: %s, cancelling...", s) - cancel() + if o.IgnoreInterrupts { + logrus.Warnf("Received an interrupt: %s, ignoring...", s) + } else { + // If we are being asked to terminate by the kubelet but we have + // NOT seen the test process exit cleanly, we need a to start + // uploading artifacts to GCS immediately. If we notice the process + // exit while doing this best-effort upload, we can race with the + // second upload but we can tolerate this as we'd rather get SOME + // data into GCS than attempt to cancel these uploads and get none. + logrus.Errorf("Received an interrupt: %s, cancelling...", s) + cancel() + } case <-ctx.Done(): } }()