1+ import json
12import logging
23import threading
34import time
78from cachebox import BaseCacheImpl , LRUCache
89from google .protobuf .json_format import MessageToDict
910from google .protobuf .struct_pb2 import Struct
11+ from grpc import ChannelConnectivity
1012
1113from openfeature .evaluation_context import EvaluationContext
1214from openfeature .event import ProviderEventDetails
@@ -47,6 +49,7 @@ def __init__(
4749 [ProviderEventDetails ], None
4850 ],
4951 ):
52+ self .active = False
5053 self .config = config
5154 self .emit_provider_ready = emit_provider_ready
5255 self .emit_provider_error = emit_provider_error
@@ -57,26 +60,50 @@ def __init__(
5760 if self .config .cache == CacheType .LRU
5861 else None
5962 )
60- self . stub , self . channel = self . _create_stub ()
61- self . retry_backoff_seconds = config .retry_backoff_ms * 0.001
62- self . retry_backoff_max_seconds = config .retry_backoff_max_ms * 0.001
63- self .retry_grace_attempts = config .retry_grace_attempts
63+
64+ retry_backoff_seconds = config .retry_backoff_ms * 0.001
65+ retry_backoff_max_seconds = config .retry_backoff_max_ms * 0.001
66+ self .retry_grace_period = config .retry_grace_period
6467 self .streamline_deadline_seconds = config .stream_deadline_ms * 0.001
6568 self .deadline = config .deadline_ms * 0.001
6669 self .connected = False
67-
68- def _create_stub (
69- self ,
70- ) -> typing .Tuple [evaluation_pb2_grpc .ServiceStub , grpc .Channel ]:
71- config = self .config
7270 channel_factory = grpc .secure_channel if config .tls else grpc .insecure_channel
73- channel = channel_factory (
71+ service_config = {
72+ "methodConfig" : [
73+ {
74+ "name" : [
75+ {
76+ "service" : "flagd.evaluation.v1.Service" ,
77+ "method" : "EventStream" ,
78+ }
79+ ],
80+ "retryPolicy" : {
81+ "maxAttempts" : 50000 , # Max value for a 32-bit integer
82+ "initialBackoff" : f"{ retry_backoff_seconds } s" , # Initial backoff delay
83+ "maxBackoff" : f"{ retry_backoff_max_seconds } s" , # Maximum backoff delay
84+ "backoffMultiplier" : 2 , # Exponential backoff multiplier
85+ "retryableStatusCodes" : [
86+ "UNAVAILABLE" ,
87+ "UNKNOWN" ,
88+ ], # Retry on these statuses
89+ },
90+ }
91+ ],
92+ }
93+
94+ # Create the channel with the service config
95+ options = [
96+ ("grpc.service_config" , json .dumps (service_config )),
97+ ("grpc.keepalive_time_ms" , config .keep_alive_time ),
98+ ]
99+ self .channel = channel_factory (
74100 f"{ config .host } :{ config .port } " ,
75- options = (( "grpc.keepalive_time_ms" , config . keep_alive_time ),) ,
101+ options = options ,
76102 )
77- stub = evaluation_pb2_grpc .ServiceStub (channel )
103+ self . stub = evaluation_pb2_grpc .ServiceStub (self . channel )
78104
79- return stub , channel
105+ self .thread : typing .Optional [threading .Thread ] = None
106+ self .timer : typing .Optional [threading .Timer ] = None
80107
81108 def initialize (self , evaluation_context : EvaluationContext ) -> None :
82109 self .connect ()
@@ -89,11 +116,12 @@ def shutdown(self) -> None:
89116
90117 def connect (self ) -> None :
91118 self .active = True
92- self .thread = threading .Thread (
93- target = self .listen , daemon = True , name = "FlagdGrpcServiceWorkerThread"
94- )
95- self .thread .start ()
96119
120+ # Run monitoring in a separate thread
121+ self .monitor_thread = threading .Thread (
122+ target = self .monitor , daemon = True , name = "FlagdGrpcServiceMonitorThread"
123+ )
124+ self .monitor_thread .start ()
97125 ## block until ready or deadline reached
98126 timeout = self .deadline + time .time ()
99127 while not self .connected and time .time () < timeout :
@@ -105,81 +133,81 @@ def connect(self) -> None:
105133 "Blocking init finished before data synced. Consider increasing startup deadline to avoid inconsistent evaluations."
106134 )
107135
136+ def monitor (self ):
137+ def state_change_callback (new_state : ChannelConnectivity ):
138+ logger .debug (f"gRPC state change: { new_state } " )
139+ if new_state == ChannelConnectivity .READY :
140+ if not self .thread or not self .thread .is_alive ():
141+ self .thread = threading .Thread (
142+ target = self .listen ,
143+ daemon = True ,
144+ name = "FlagdGrpcServiceWorkerThread" ,
145+ )
146+ self .thread .start ()
147+
148+ if self .timer and self .timer .is_alive ():
149+ logger .debug ("gRPC error timer expired" )
150+ self .timer .cancel ()
151+
152+ elif new_state == ChannelConnectivity .TRANSIENT_FAILURE :
153+ # this is the failed reonnect attempt so we are going into stale
154+ self .emit_provider_stale (
155+ ProviderEventDetails (
156+ message = "gRPC sync disconnected, reconnecting" ,
157+ )
158+ )
159+ # adding a timer, so we can emit the error event after time
160+ self .timer = threading .Timer (self .retry_grace_period , self .emit_error )
161+
162+ logger .debug ("gRPC error timer started" )
163+ self .timer .start ()
164+ self .connected = False
165+
166+ self .channel .subscribe (state_change_callback , try_to_connect = True )
167+
168+ def emit_error (self ) -> None :
169+ logger .debug ("gRPC error emitted" )
170+ if self .cache :
171+ self .cache .clear ()
172+ self .emit_provider_error (
173+ ProviderEventDetails (
174+ message = "gRPC sync disconnected, reconnecting" ,
175+ error_code = ErrorCode .GENERAL ,
176+ )
177+ )
178+
108179 def listen (self ) -> None :
109- retry_delay = self . retry_backoff_seconds
180+ logger . info ( "gRPC starting listener thread" )
110181 call_args = (
111182 {"timeout" : self .streamline_deadline_seconds }
112183 if self .streamline_deadline_seconds > 0
113184 else {}
114185 )
115- retry_counter = 0
116- while self .active :
117- request = evaluation_pb2 .EventStreamRequest ()
186+ request = evaluation_pb2 .EventStreamRequest ()
118187
188+ # defining a never ending loop to recreate the stream
189+ while self .active :
119190 try :
120- logger .debug ("Setting up gRPC sync flags connection" )
191+ logger .info ("Setting up gRPC sync flags connection" )
121192 for message in self .stub .EventStream (request , ** call_args ):
122193 if message .type == "provider_ready" :
123- if not self .connected :
124- self .emit_provider_ready (
125- ProviderEventDetails (
126- message = "gRPC sync connection established"
127- )
194+ self .connected = True
195+ self .emit_provider_ready (
196+ ProviderEventDetails (
197+ message = "gRPC sync connection established"
128198 )
129- self .connected = True
130- retry_counter = 0
131- # reset retry delay after successsful read
132- retry_delay = self .retry_backoff_seconds
133-
199+ )
134200 elif message .type == "configuration_change" :
135201 data = MessageToDict (message )["data" ]
136202 self .handle_changed_flags (data )
137-
138- if not self .active :
139- logger .info ("Terminating gRPC sync thread" )
140- return
141- except grpc .RpcError as e :
142- logger .error (f"SyncFlags stream error, { e .code ()= } { e .details ()= } " )
143- # re-create the stub if there's a connection issue - otherwise reconnect does not work as expected
144- self .stub , self .channel = self ._create_stub ()
203+ except grpc .RpcError as e : # noqa: PERF203
204+ # although it seems like this error log is not interesting, without it, the retry is not working as expected
205+ logger .debug (f"SyncFlags stream error, { e .code ()= } { e .details ()= } " )
145206 except ParseError :
146207 logger .exception (
147208 f"Could not parse flag data using flagd syntax: { message = } "
148209 )
149210
150- self .connected = False
151- self .on_connection_error (retry_counter , retry_delay )
152-
153- retry_delay = self .handle_retry (retry_counter , retry_delay )
154-
155- retry_counter = retry_counter + 1
156-
157- def handle_retry (self , retry_counter : int , retry_delay : float ) -> float :
158- if retry_counter == 0 :
159- logger .info ("gRPC sync disconnected, reconnecting immediately" )
160- else :
161- logger .info (f"gRPC sync disconnected, reconnecting in { retry_delay } s" )
162- time .sleep (retry_delay )
163- retry_delay = min (1.1 * retry_delay , self .retry_backoff_max_seconds )
164- return retry_delay
165-
166- def on_connection_error (self , retry_counter : int , retry_delay : float ) -> None :
167- if retry_counter == self .retry_grace_attempts :
168- if self .cache :
169- self .cache .clear ()
170- self .emit_provider_error (
171- ProviderEventDetails (
172- message = f"gRPC sync disconnected, reconnecting in { retry_delay } s" ,
173- error_code = ErrorCode .GENERAL ,
174- )
175- )
176- elif retry_counter == 1 :
177- self .emit_provider_stale (
178- ProviderEventDetails (
179- message = f"gRPC sync disconnected, reconnecting in { retry_delay } s" ,
180- )
181- )
182-
183211 def handle_changed_flags (self , data : typing .Any ) -> None :
184212 changed_flags = list (data ["flags" ].keys ())
185213
0 commit comments