-
-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(processor): Create strategy lazily on first message #317
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,10 +63,6 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: | |
return decorator | ||
|
||
|
||
class InvalidStateError(RuntimeError): | ||
pass | ||
|
||
|
||
ConsumerTiming = Literal[ | ||
"arroyo.consumer.poll.time", | ||
"arroyo.consumer.processing.time", | ||
|
@@ -210,23 +206,6 @@ def _close_strategy() -> None: | |
|
||
self.__metrics_buffer.incr_timing("arroyo.consumer.shutdown.time", value) | ||
|
||
def _create_strategy(partitions: Mapping[Partition, int]) -> None: | ||
start_create = time.time() | ||
|
||
self.__processing_strategy = ( | ||
self.__processor_factory.create_with_partitions( | ||
self.__commit, partitions | ||
) | ||
) | ||
|
||
self.__metrics_buffer.metrics.timing( | ||
"arroyo.consumer.run.create_strategy", time.time() - start_create | ||
) | ||
|
||
logger.debug( | ||
"Initialized processing strategy: %r", self.__processing_strategy | ||
) | ||
|
||
@_rdkafka_callback(metrics=self.__metrics_buffer) | ||
def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None: | ||
logger.info("New partitions assigned: %r", partitions) | ||
|
@@ -243,7 +222,6 @@ def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None: | |
"Partition assignment while processing strategy active" | ||
) | ||
_close_strategy() | ||
_create_strategy(partitions) | ||
|
||
@_rdkafka_callback(metrics=self.__metrics_buffer) | ||
def on_partitions_revoked(partitions: Sequence[Partition]) -> None: | ||
|
@@ -256,24 +234,6 @@ def on_partitions_revoked(partitions: Sequence[Partition]) -> None: | |
if partitions: | ||
_close_strategy() | ||
|
||
# Recreate the strategy if the consumer still has other partitions | ||
# assigned and is not closed or errored | ||
try: | ||
current_partitions = self.__consumer.tell() | ||
if len(current_partitions.keys() - set(partitions)): | ||
active_partitions = { | ||
partition: offset | ||
for partition, offset in current_partitions.items() | ||
if partition not in partitions | ||
} | ||
logger.info( | ||
"Recreating strategy since there are still active partitions: %r", | ||
active_partitions, | ||
) | ||
_create_strategy(active_partitions) | ||
except RuntimeError: | ||
pass | ||
|
||
Comment on lines
-259
to
-276
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a note that we don't get a subset of partitions revoked today as we are using eager rebalancing. But that may change in the future. This code was written this way so it also works with incremental rebalancing, even though we never actually recreate the strategy on partitions revoked. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes i think we're now still handling that case correctly. this code effectively moved from the callback into just the condition under which we create the strategy changed. before, we'd recreate the strategy if we still had partitions assigned, now we're creating the strategy if we have none and |
||
# Partition revocation can happen anytime during the consumer lifecycle and happen | ||
# multiple times. What we want to know is that the consumer is not stuck somewhere. | ||
# The presence of this message as the last message of a consumer | ||
|
@@ -310,6 +270,19 @@ def __commit(self, offsets: Mapping[Partition, int], force: bool = False) -> Non | |
) | ||
self.__commit_policy_state.did_commit(now, offsets) | ||
|
||
def __create_strategy(self, partitions: Mapping[Partition, int]) -> None: | ||
start_create = time.time() | ||
|
||
self.__processing_strategy = self.__processor_factory.create_with_partitions( | ||
self.__commit, partitions | ||
) | ||
|
||
self.__metrics_buffer.metrics.timing( | ||
"arroyo.consumer.run.create_strategy", time.time() - start_create | ||
) | ||
|
||
logger.debug("Initialized processing strategy: %r", self.__processing_strategy) | ||
|
||
def run(self) -> None: | ||
"The main run loop, see class docstring for more information." | ||
|
||
|
@@ -387,6 +360,10 @@ def _run_once(self) -> None: | |
except RecoverableError: | ||
return | ||
|
||
if self.__processing_strategy is None and self.__message is not None: | ||
current_offsets = self.__consumer.tell() | ||
self.__create_strategy(current_offsets) | ||
|
||
if self.__processing_strategy is not None: | ||
start_poll = time.time() | ||
try: | ||
|
@@ -398,58 +375,59 @@ def _run_once(self) -> None: | |
self.__metrics_buffer.incr_timing( | ||
"arroyo.consumer.processing.time", time.time() - start_poll | ||
) | ||
if self.__message is not None: | ||
try: | ||
start_submit = time.time() | ||
message = ( | ||
Message(self.__message) if self.__message is not None else None | ||
) | ||
self.__processing_strategy.submit(message) | ||
|
||
self.__metrics_buffer.incr_timing( | ||
"arroyo.consumer.processing.time", | ||
time.time() - start_submit, | ||
) | ||
except MessageRejected as e: | ||
# If the processing strategy rejected our message, we need | ||
# to pause the consumer and hold the message until it is | ||
# accepted, at which point we can resume consuming. | ||
# if not message_carried_over: | ||
if self.__backpressure_timestamp is None: | ||
self.__backpressure_timestamp = time.time() | ||
|
||
elif not self.__is_paused and ( | ||
time.time() - self.__backpressure_timestamp > 1 | ||
): | ||
logger.debug( | ||
"Caught %r while submitting %r, pausing consumer...", | ||
e, | ||
self.__message, | ||
) | ||
self.__consumer.pause([*self.__consumer.tell().keys()]) | ||
self.__is_paused = True | ||
|
||
else: | ||
time.sleep(0.01) | ||
if self.__message is not None: | ||
# in a previous if-stmt, we have ensured that | ||
# self.__processing_strategy is available if self.__message is not | ||
# None | ||
assert self.__processing_strategy is not None | ||
|
||
except InvalidMessage as e: | ||
self._handle_invalid_message(e) | ||
try: | ||
start_submit = time.time() | ||
message = ( | ||
Message(self.__message) if self.__message is not None else None | ||
) | ||
self.__processing_strategy.submit(message) | ||
|
||
else: | ||
# Resume if we are currently in a paused state | ||
if self.__is_paused: | ||
self.__consumer.resume([*self.__consumer.tell().keys()]) | ||
self.__is_paused = False | ||
|
||
# Clear backpressure timestamp if it is set | ||
self._clear_backpressure() | ||
|
||
self.__message = None | ||
else: | ||
if self.__message is not None: | ||
raise InvalidStateError( | ||
"received message without active processing strategy" | ||
self.__metrics_buffer.incr_timing( | ||
"arroyo.consumer.processing.time", | ||
time.time() - start_submit, | ||
) | ||
except MessageRejected as e: | ||
# If the processing strategy rejected our message, we need | ||
# to pause the consumer and hold the message until it is | ||
# accepted, at which point we can resume consuming. | ||
# if not message_carried_over: | ||
if self.__backpressure_timestamp is None: | ||
self.__backpressure_timestamp = time.time() | ||
|
||
elif not self.__is_paused and ( | ||
time.time() - self.__backpressure_timestamp > 1 | ||
): | ||
logger.debug( | ||
"Caught %r while submitting %r, pausing consumer...", | ||
e, | ||
self.__message, | ||
) | ||
self.__consumer.pause([*self.__consumer.tell().keys()]) | ||
self.__is_paused = True | ||
|
||
else: | ||
time.sleep(0.01) | ||
|
||
except InvalidMessage as e: | ||
self._handle_invalid_message(e) | ||
|
||
else: | ||
# Resume if we are currently in a paused state | ||
if self.__is_paused: | ||
self.__consumer.resume([*self.__consumer.tell().keys()]) | ||
self.__is_paused = False | ||
|
||
# Clear backpressure timestamp if it is set | ||
self._clear_backpressure() | ||
|
||
self.__message = None | ||
|
||
def signal_shutdown(self) -> None: | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we assert that
self.__processing_strategy
is None here, or make sure we close if it is not None? What would happen if we get an assign without a prior revocation? I know that doesn't actually happen today but the code right now is designed to work for incremental assignment as well, which might not be the case after this change.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not sure i follow, we do close it on the line before exactly when the strategy is not None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fwiw this part of rebalancing was actually broken with incremental assign, as we would create the strategy with the subset of partitions from this callback instead of the full assignment. with this pr we're now actually only calling
_create_strategy
withconsumer.tell()
which should always contain the full assignment