-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Fix opsgenie plugin to use correct annotation to determine whether or not to create alerts #38435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
69f7c35
c7ae046
cb75dd9
38ad63b
d0798ef
c37e82b
0daf76c
759d35e
92187ff
bf21892
a7df8ff
7b87ee9
0462903
1d41ac3
6bb8aca
a029ebd
6dc6241
33e83a1
443703f
57a5a7a
170d55c
fe4d8e8
2598ccc
293f65b
27de3fc
0f3ff90
1a8c5b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,17 +30,24 @@ import ( | |
| "github.com/aws/aws-sdk-go/aws/defaults" | ||
| "github.com/go-resty/resty/v2" | ||
| "github.com/gravitational/trace" | ||
| "github.com/jonboulle/clockwork" | ||
|
|
||
| "github.com/gravitational/teleport/api/types" | ||
| "github.com/gravitational/teleport/integrations/access/common" | ||
| "github.com/gravitational/teleport/integrations/lib" | ||
| "github.com/gravitational/teleport/integrations/lib/backoff" | ||
| "github.com/gravitational/teleport/integrations/lib/logger" | ||
| ) | ||
|
|
||
| const ( | ||
| // alertKeyPrefix is the prefix for Alert's alias field used when creating an Alert. | ||
| alertKeyPrefix = "teleport-access-request" | ||
| heartbeatName = "teleport-access-heartbeat" | ||
| alertKeyPrefix = "teleport-access-request" | ||
| heartbeatName = "teleport-access-heartbeat" | ||
| ResponderTypeSchedule = "schedule" | ||
| ResponderTypeUser = "user" | ||
|
|
||
| ResolveAlertRequestRetryInterval = time.Second * 10 | ||
| ResolveAlertRequestRetryTimeout = time.Minute * 2 | ||
| ) | ||
|
|
||
| var alertBodyTemplate = template.Must(template.New("alert body").Parse( | ||
|
|
@@ -135,11 +142,11 @@ func (og Client) CreateAlert(ctx context.Context, reqID string, reqData RequestD | |
| Message: fmt.Sprintf("Access request from %s", reqData.User), | ||
| Alias: fmt.Sprintf("%s/%s", alertKeyPrefix, reqID), | ||
| Description: bodyDetails, | ||
| Responders: og.getResponders(reqData), | ||
| Responders: og.getScheduleResponders(reqData), | ||
| Priority: og.Priority, | ||
| } | ||
|
|
||
| var result AlertResult | ||
| var result CreateAlertResult | ||
| resp, err := og.client.NewRequest(). | ||
| SetContext(ctx). | ||
| SetBody(body). | ||
|
|
@@ -153,20 +160,60 @@ func (og Client) CreateAlert(ctx context.Context, reqID string, reqData RequestD | |
| if resp.IsError() { | ||
| return OpsgenieData{}, errWrapper(resp.StatusCode(), string(resp.Body())) | ||
| } | ||
|
|
||
| // If this fails, Teleport request approval and auto-approval will still work, | ||
| // but incident in Opsgenie won't be auto-closed or updated as the alertID won't be available. | ||
| alertRequestResult, err := og.tryGetAlertRequestResult(ctx, result.RequestID) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So what happens if this fails for some reason? I wonder if it would be more robust to move this logic to request resolution time. So we would store
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have been trying out implementing this this morning and I think it should work but it gets a bit messy now having to double check everywhere we interact with an alert whether or not we have the alert or the request id and resolving it if appropriate. I think we might end up having to change some of the shared plugin code as well if we go this route to allow the accessrequest/MessageData stuff to represent a 'message' that we dont have the ID of yet
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the event this fails it could go one of two ways depending on if the request ever does end up completing successfully after the retries fail.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The auto-approval will still work in both of these cases, right? As long as we're able to determine if requester is on-call. Cause we never interact with alert/request ID to actually approve Teleport's request - just the alert will not be automatically closed in Opsgenie, correct?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the notify error in onPendingRequest in opsgenie/app.go and the error from trying to resolve the request are put into an aggregate error instead of returning early.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, in that case, let's do this:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added in the comment and upped the timeout, does this look okay now?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I already approved so let's get this merged and do the backports. |
||
| if err != nil { | ||
| return OpsgenieData{}, trace.Wrap(err) | ||
| } | ||
|
|
||
| return OpsgenieData{ | ||
| AlertID: result.Alert.ID, | ||
| AlertID: alertRequestResult.Data.AlertID, | ||
| }, nil | ||
| } | ||
|
|
||
| func (og Client) getResponders(reqData RequestData) []Responder { | ||
| func (og Client) tryGetAlertRequestResult(ctx context.Context, reqID string) (GetAlertRequestResult, error) { | ||
| backoff := backoff.NewDecorr(ResolveAlertRequestRetryInterval, ResolveAlertRequestRetryTimeout, clockwork.NewRealClock()) | ||
| for { | ||
| alertRequestResult, err := og.getAlertRequestResult(ctx, reqID) | ||
| if err == nil { | ||
| logger.Get(ctx).Debugf("Got alert request result: %+v", alertRequestResult) | ||
| return alertRequestResult, nil | ||
| } | ||
| logger.Get(ctx).Debug("Failed to get alert request result:", err) | ||
| if err := backoff.Do(ctx); err != nil { | ||
| return GetAlertRequestResult{}, trace.Wrap(err) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func (og Client) getAlertRequestResult(ctx context.Context, reqID string) (GetAlertRequestResult, error) { | ||
| var result GetAlertRequestResult | ||
| resp, err := og.client.NewRequest(). | ||
| SetContext(ctx). | ||
| SetResult(&result). | ||
| SetPathParams(map[string]string{"requestID": reqID}). | ||
| Get("v2/alerts/requests/{requestID}") | ||
| if err != nil { | ||
| return GetAlertRequestResult{}, trace.Wrap(err) | ||
| } | ||
| defer resp.RawResponse.Body.Close() | ||
| if resp.IsError() { | ||
| return GetAlertRequestResult{}, errWrapper(resp.StatusCode(), string(resp.Body())) | ||
| } | ||
| return result, nil | ||
| } | ||
|
|
||
| func (og Client) getScheduleResponders(reqData RequestData) []Responder { | ||
| schedules := og.DefaultSchedules | ||
| if reqSchedules, ok := reqData.SystemAnnotations[types.TeleportNamespace+types.ReqAnnotationSchedulesLabel]; ok { | ||
| if reqSchedules, ok := reqData.SystemAnnotations[types.TeleportNamespace+types.ReqAnnotationNotifySchedulesLabel]; ok { | ||
| schedules = reqSchedules | ||
| } | ||
| responders := make([]Responder, 0, len(schedules)) | ||
| for _, s := range schedules { | ||
| responders = append(responders, Responder{ | ||
| Type: "schedule", | ||
| Type: ResponderTypeSchedule, | ||
| ID: s, | ||
| }) | ||
| } | ||
|
|
@@ -231,12 +278,13 @@ func (og Client) GetOnCall(ctx context.Context, scheduleName string) (Responders | |
| SetContext(ctx). | ||
| SetPathParams(map[string]string{"scheduleName": scheduleName}). | ||
| SetQueryParams(map[string]string{ | ||
| // This is required to lookup schedules by name (as opposed to lookup by ID) | ||
| "scheduleIdentifierType": "name", | ||
| // When flat is enabled it returns the email addresses of on-call participants. | ||
| "flat": "true", | ||
| }). | ||
| SetResult(&result). | ||
| Post("v2/schedules/{scheduleName}/on-calls") | ||
| Get("v2/schedules/{scheduleName}/on-calls") | ||
| if err != nil { | ||
| return RespondersResult{}, trace.Wrap(err) | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.