Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(alerts): Add ratelimiting for metric alerts #58032

Merged
merged 15 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/sentry/incidents/subscription_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

from django.conf import settings
from django.db import router, transaction
from django.utils import timezone
from snuba_sdk import Column, Condition, Limit, Op

from sentry import features
from sentry import features, options
from sentry.constants import CRASH_RATE_ALERT_AGGREGATE_ALIAS, CRASH_RATE_ALERT_SESSION_COUNT_ALIAS
from sentry.incidents.logic import (
CRITICAL_TRIGGER_LABEL,
Expand Down Expand Up @@ -550,8 +551,40 @@ def trigger_alert_threshold(
:return:
"""
self.trigger_alert_counts[trigger.id] += 1

if options.get("metric_alerts.rate_limit"):
# If an incident was created for this rule, trigger type, and subscription
# within the last 10 minutes, don't make another one
last_it = (
IncidentTrigger.objects.filter(alert_rule_trigger=trigger)
.order_by("-incident_id")
.select_related("incident")
.first()
)
last_incident: Incident | None = last_it.incident if last_it else None
last_incident_projects = (
[project.id for project in last_incident.projects.all()] if last_incident else []
)
minutes_since_last_incident = (
(timezone.now() - last_incident.date_added).seconds / 60 if last_incident else None
)
if (
last_incident
and self.subscription.project.id in last_incident_projects
and minutes_since_last_incident <= 10
):
metrics.incr(
"incidents.alert_rules.hit_rate_limit",
tags={
"last_incident_id": last_incident.id,
"project_id": self.subscription.project.id,
"trigger_id": trigger.id,
},
)
return None
if self.trigger_alert_counts[trigger.id] >= self.alert_rule.threshold_period:
metrics.incr("incidents.alert_rules.trigger", tags={"type": "fire"})

# Only create a new incident if we don't already have an active one
if not self.active_incident:
detected_at = self.calculate_event_date_from_update_date(self.last_update)
Expand Down
2 changes: 2 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -1636,3 +1636,5 @@
default=False,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)

register("metric_alerts.rate_limit", default=False, flags=FLAG_AUTOMATOR_MODIFIABLE)
73 changes: 73 additions & 0 deletions tests/sentry/incidents/test_subscription_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from sentry.snuba.dataset import Dataset
from sentry.snuba.models import QuerySubscription, SnubaQueryEventType
from sentry.testutils.cases import BaseMetricsTestCase, SnubaTestCase, TestCase
from sentry.testutils.helpers import override_options
from sentry.testutils.helpers.datetime import freeze_time, iso_format
from sentry.utils import json
from sentry.utils.dates import to_timestamp
Expand Down Expand Up @@ -167,6 +168,12 @@ def assert_trigger_counts(self, processor, trigger, alert_triggers=0, resolve_tr
def latest_activity(self, incident):
return IncidentActivity.objects.filter(incident=incident).order_by("-id").first()

def assert_incident_is_latest_for_rule(self, incident):
last_incident = (
Incident.objects.filter(alert_rule=incident.alert_rule).order_by("-date_added").first()
)
assert last_incident == incident


@freeze_time()
class ProcessUpdateTest(ProcessUpdateBaseClass):
Expand Down Expand Up @@ -2106,6 +2113,72 @@ def test_comparison_alert_different_aggregate(self):
incident, [self.action], [(150.0, IncidentStatus.CLOSED, mock.ANY)]
)

@override_options({"metric_alerts.rate_limit": True})
def test_no_new_incidents_within_ten_minutes(self):
# Verify that a new incident is not made for the same rule, trigger, and
# subscription if an incident was already made within the last 10 minutes.
rule = self.rule
trigger = self.trigger
processor = self.send_update(
rule, trigger.alert_threshold + 1, timedelta(minutes=-2), self.sub
)
self.assert_trigger_counts(processor, self.trigger, 0, 0)
original_incident = self.assert_active_incident(rule)
original_incident.update(date_added=original_incident.date_added - timedelta(minutes=10))
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.ACTIVE)

# resolve the trigger
self.send_update(rule, 6, timedelta(minutes=-1), subscription=self.sub)
self.assert_no_active_incident(rule)
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.RESOLVED)

# fire trigger again within 10 minutes; no new incident should be made
processor = self.send_update(rule, trigger.alert_threshold + 1, subscription=self.sub)
self.assert_trigger_counts(processor, self.trigger, 1, 0)
self.assert_no_active_incident(rule)
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.RESOLVED)
self.assert_incident_is_latest_for_rule(original_incident)
self.metrics.incr.assert_has_calls(
[
call(
"incidents.alert_rules.hit_rate_limit",
tags={
"last_incident_id": original_incident.id,
"project_id": self.sub.project.id,
"trigger_id": trigger.id,
},
),
],
any_order=True,
)

@override_options({"metric_alerts.rate_limit": True})
def test_incident_made_after_ten_minutes(self):
# Verify that a new incident will be made for the same rule, trigger, and
# subscription if the last incident made for those was made more tha 10 minutes
# ago
rule = self.rule
trigger = self.trigger
processor = self.send_update(
rule, trigger.alert_threshold + 1, timedelta(minutes=-2), self.sub
)
self.assert_trigger_counts(processor, self.trigger, 0, 0)
original_incident = self.assert_active_incident(rule)
original_incident.update(date_added=original_incident.date_added - timedelta(minutes=11))
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.ACTIVE)

# resolve the trigger
self.send_update(rule, 6, timedelta(minutes=-1), self.sub)
self.assert_no_active_incident(rule)
self.assert_trigger_exists_with_status(original_incident, trigger, TriggerStatus.RESOLVED)

# fire trigger again after more than 10 minutes have passed; a new incident should be made
processor = self.send_update(rule, trigger.alert_threshold + 1, subscription=self.sub)
self.assert_trigger_counts(processor, self.trigger, 0, 0)
new_incident = self.assert_active_incident(rule)
self.assert_trigger_exists_with_status(new_incident, trigger, TriggerStatus.ACTIVE)
self.assert_incident_is_latest_for_rule(new_incident)


class MetricsCrashRateAlertProcessUpdateTest(ProcessUpdateBaseClass, BaseMetricsTestCase):
@pytest.fixture(autouse=True)
Expand Down
Loading