-
Notifications
You must be signed in to change notification settings - Fork 101
fix: update retry strategy for mutation calls to handle aborted transactions #1279
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
01a0196
2a9b805
c634bdb
a6e25a3
198c7df
1032b8b
d4e7d9c
fa8ae71
d5c4975
7f3088c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,11 +27,15 @@ | |
| from google.protobuf.internal.enum_type_wrapper import EnumTypeWrapper | ||
|
|
||
| from google.api_core import datetime_helpers | ||
| from google.api_core.exceptions import Aborted | ||
| from google.cloud._helpers import _date_from_iso8601_date | ||
| from google.cloud.spanner_v1 import TypeCode | ||
| from google.cloud.spanner_v1 import ExecuteSqlRequest | ||
| from google.cloud.spanner_v1 import JsonObject | ||
| from google.cloud.spanner_v1.request_id_header import with_request_id | ||
| from google.rpc.error_details_pb2 import RetryInfo | ||
|
|
||
| import random | ||
|
|
||
| # Validation error messages | ||
| NUMERIC_MAX_SCALE_ERR_MSG = ( | ||
|
|
@@ -466,13 +470,19 @@ def _retry( | |
| delay=2, | ||
| allowed_exceptions=None, | ||
| beforeNextRetry=None, | ||
| deadline=None, | ||
| ): | ||
| """ | ||
| Retry a function with a specified number of retries, delay between retries, and list of allowed exceptions. | ||
| Retry a specified function with different logic based on the type of exception raised. | ||
|
|
||
| If the exception is of type google.api_core.exceptions.Aborted, | ||
| apply an alternate retry strategy that relies on the provided deadline value instead of a fixed number of retries. | ||
| For all other exceptions, retry the function up to a specified number of times. | ||
|
|
||
| Args: | ||
| func: The function to be retried. | ||
| retry_count: The maximum number of times to retry the function. | ||
| deadline: This will be used in case of Aborted transactions. | ||
|
||
| delay: The delay in seconds between retries. | ||
| allowed_exceptions: A tuple of exceptions that are allowed to occur without triggering a retry. | ||
| Passing allowed_exceptions as None will lead to retrying for all exceptions. | ||
|
|
@@ -481,13 +491,21 @@ def _retry( | |
| The result of the function if it is successful, or raises the last exception if all retries fail. | ||
| """ | ||
| retries = 0 | ||
| while retries <= retry_count: | ||
| while True: | ||
|
||
| if retries > 0 and beforeNextRetry: | ||
| beforeNextRetry(retries, delay) | ||
|
|
||
| try: | ||
| return func() | ||
| except Exception as exc: | ||
| if isinstance(exc, Aborted) and deadline is not None: | ||
| if ( | ||
| allowed_exceptions is not None | ||
| and allowed_exceptions.get(exc.__class__) is not None | ||
| ): | ||
| retries += 1 | ||
| _delay_until_retry(exc, deadline=deadline, attempts=retries) | ||
| continue | ||
| if ( | ||
| allowed_exceptions is None or exc.__class__ in allowed_exceptions | ||
| ) and retries < retry_count: | ||
|
|
@@ -529,6 +547,61 @@ def _metadata_with_leader_aware_routing(value, **kw): | |
| return ("x-goog-spanner-route-to-leader", str(value).lower()) | ||
|
|
||
|
|
||
| def _delay_until_retry(exc, deadline, attempts): | ||
| """Helper for :meth:`Session.run_in_transaction`. | ||
|
|
||
| Detect retryable abort, and impose server-supplied delay. | ||
|
|
||
| :type exc: :class:`google.api_core.exceptions.Aborted` | ||
| :param exc: exception for aborted transaction | ||
|
|
||
| :type deadline: float | ||
| :param deadline: maximum timestamp to continue retrying the transaction. | ||
|
|
||
| :type attempts: int | ||
| :param attempts: number of call retries | ||
| """ | ||
|
|
||
| cause = exc.errors[0] | ||
| now = time.time() | ||
| if now >= deadline: | ||
| raise | ||
|
|
||
| delay = _get_retry_delay(cause, attempts) | ||
| print(now, delay, deadline) | ||
|
||
| if delay is not None: | ||
| if now + delay > deadline: | ||
| raise | ||
|
|
||
| time.sleep(delay) | ||
|
|
||
|
|
||
| def _get_retry_delay(cause, attempts): | ||
| """Helper for :func:`_delay_until_retry`. | ||
|
|
||
| :type exc: :class:`grpc.Call` | ||
| :param exc: exception for aborted transaction | ||
|
|
||
| :rtype: float | ||
| :returns: seconds to wait before retrying the transaction. | ||
|
|
||
| :type attempts: int | ||
| :param attempts: number of call retries | ||
| """ | ||
| if hasattr(cause, "trailing_metadata"): | ||
| metadata = dict(cause.trailing_metadata()) | ||
| else: | ||
| metadata = {} | ||
| retry_info_pb = metadata.get("google.rpc.retryinfo-bin") | ||
| if retry_info_pb is not None: | ||
| retry_info = RetryInfo() | ||
| retry_info.ParseFromString(retry_info_pb) | ||
| nanos = retry_info.retry_delay.nanos | ||
| return retry_info.retry_delay.seconds + nanos / 1.0e9 | ||
|
|
||
| return 2**attempts + random.random() | ||
|
|
||
|
|
||
| class AtomicCounter: | ||
| def __init__(self, start_value=0): | ||
| self.__lock = threading.Lock() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,10 @@ | |
| from google.cloud.spanner_v1._helpers import _retry | ||
| from google.cloud.spanner_v1._helpers import _check_rst_stream_error | ||
| from google.api_core.exceptions import InternalServerError | ||
| from google.api_core.exceptions import Aborted | ||
| import time | ||
|
|
||
| DEFAULT_RETRY_TIMEOUT_SECS = 30 | ||
|
|
||
|
|
||
| class _BatchBase(_SessionWrapper): | ||
|
|
@@ -162,6 +166,7 @@ def commit( | |
| request_options=None, | ||
| max_commit_delay=None, | ||
| exclude_txn_from_change_streams=False, | ||
| **kwargs, | ||
| ): | ||
| """Commit mutations to the database. | ||
|
|
||
|
|
@@ -227,9 +232,16 @@ def commit( | |
| request=request, | ||
| metadata=metadata, | ||
| ) | ||
| deadline = time.time() + kwargs.get( | ||
| "timeout_secs", DEFAULT_RETRY_TIMEOUT_SECS | ||
| ) | ||
| response = _retry( | ||
| method, | ||
| allowed_exceptions={InternalServerError: _check_rst_stream_error}, | ||
| allowed_exceptions={ | ||
| InternalServerError: _check_rst_stream_error, | ||
| Aborted: no_op_handler, | ||
| }, | ||
| deadline=deadline, | ||
| ) | ||
| self.committed = response.commit_timestamp | ||
| self.commit_stats = response.commit_stats | ||
|
|
@@ -293,7 +305,9 @@ def group(self): | |
| self._mutation_groups.append(mutation_group) | ||
| return MutationGroup(self._session, mutation_group.mutations) | ||
|
|
||
| def batch_write(self, request_options=None, exclude_txn_from_change_streams=False): | ||
| def batch_write( | ||
|
||
| self, request_options=None, exclude_txn_from_change_streams=False, **kwargs | ||
| ): | ||
| """Executes batch_write. | ||
|
|
||
| :type request_options: | ||
|
|
@@ -346,9 +360,16 @@ def batch_write(self, request_options=None, exclude_txn_from_change_streams=Fals | |
| request=request, | ||
| metadata=metadata, | ||
| ) | ||
| deadline = time.time() + kwargs.get( | ||
| "timeout_secs", DEFAULT_RETRY_TIMEOUT_SECS | ||
| ) | ||
| response = _retry( | ||
| method, | ||
| allowed_exceptions={InternalServerError: _check_rst_stream_error}, | ||
| allowed_exceptions={ | ||
| InternalServerError: _check_rst_stream_error, | ||
| Aborted: no_op_handler, | ||
| }, | ||
| deadline=deadline, | ||
| ) | ||
| self.committed = True | ||
| return response | ||
|
|
@@ -372,3 +393,8 @@ def _make_write_pb(table, columns, values): | |
| return Mutation.Write( | ||
| table=table, columns=columns, values=_make_list_value_pbs(values) | ||
| ) | ||
|
|
||
|
|
||
| def no_op_handler(exc): | ||
| # No-op (does nothing) | ||
| pass | ||
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This API is not really logical. I would suggest splitting this into two separate functions:
`_retry_on_aborted_exception` that handles that specific case. In the current form, the API is quite 'magical' and hard to understand. What, for example, is the definition of this function if you call it with
`Aborted` as one of the allowed exceptions? Will it use the specific logic for `Aborted` in all cases? Or only if you have also supplied a deadline? What is the meaning of `retry_count` if you use it to retry `Aborted` errors? etc...
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the exception is of type `Aborted`, it will activate the custom retry strategy. However, this will only occur if the user has listed this exception in the
`allowed_exceptions` map and provided a `deadline` value. If either condition is missing, the exception will not be retried. For the batch API use case, we will specifically allow this exception to be retried.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant that the
`_retry` function and the `_retry_on_aborted_exception` function should be completely separated. I don't really see any advantage of combining them, as the actual code that can be shared is minimal, and the API surface of this function is not logical. E.g. if you have defined
`Aborted` as a retryable exception, but you forget to supply a deadline, then all of a sudden it is not retryable. Also, deadline is only used if you add `Aborted` as a possible retryable error, and is otherwise ignored if you only supply other error codes. Same with `retry_count`; it is only used for non-`Aborted` errors. The fact that there are many combinations of input arguments that don't make any sense is an indication that the function itself should be split.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the clarification. I've implemented the new retry logic as suggested, separating the
`_retry` and `_retry_on_aborted_exception` functions. This ensures clearer logic, as combining them led to confusing combinations of parameters that didn't make sense. Now, the retry logic for non-`Aborted` and `Aborted` exceptions is more distinct and easier to manage.