@@ -84,6 +84,7 @@ async def run(self, states: List[State]):
8484 alert = self .open_alerts .get (alert_identifier )
8585 if alert is None :
8686 self .open_alerts [alert_identifier ] = {
87+ "type" : check .__class__ .__name__ ,
8788 "window_start" : current_time .isoformat (),
8889 "failures" : 1 ,
8990 "last_window_failures" : None ,
@@ -175,21 +176,31 @@ async def process_zenduty_events(self, current_time):
175176
176177 for identifier , info in self .open_alerts .items ():
177178 self .check_zd_alert_status (identifier , current_time )
178- # Resolve the alert if raised and failed < 5 times in the last 5m window
179+ check_config = self .config ["checks" ]["global" ][info ["type" ]]
180+ alert_threshold = check_config .get ("zenduty_alert_threshold" , 5 )
181+ resolution_threshold = check_config .get ("zenduty_resolution_threshold" , 3 )
182+ # Resolve the alert if it failed <= resolution_threshold times in the last 5m window
183+ resolved = False
179184 if (
180- info ["sent" ]
181- and info ["last_window_failures" ] is not None
182- and info ["last_window_failures" ] < 5
185+ info ["last_window_failures" ] is not None
186+ and info ["last_window_failures" ] <= resolution_threshold
183187 ):
184188 logger .debug (f"Resolving Zenduty alert { identifier } " )
185- response = await send_zenduty_alert (
186- identifier , identifier , resolved = True
187- )
188- if response and 200 <= response .status < 300 :
189+ resolved = True
190+ if info ["sent" ]:
191+ response = await send_zenduty_alert (
192+ identifier , identifier , resolved = True
193+ )
194+ if response and 200 <= response .status < 300 :
195+ to_remove .append (identifier )
196+ else :
189197 to_remove .append (identifier )
190- # Raise alert if failed > 5 times within the last 5m window
191- # re-alert every 5 minutes
192- elif info ["failures" ] >= 5 and (
198+ # Raise alert if failed >= alert_threshold times within the last 5m window
199+ # or if already alerted and not yet resolved.
200+ # Re-alert every 5 minutes but not more often.
201+ elif (
202+ info ["failures" ] >= alert_threshold or (info ["sent" ] and not resolved )
203+ ) and (
193204 not info .get ("last_alert" )
194205 or current_time - datetime .fromisoformat (info ["last_alert" ])
195206 > timedelta (minutes = 5 )
0 commit comments