A little more rate limit tweaking for v0.4.4 (#142)
This adds a bunch of little tweaks related to the recent rate limit work. It should be more-or-less ready for release after this.

**The main thing** is adding a 60-second delay for 429 (rate limit) responses. Looking back at the docstring for the `RateLimitError` class, I think I might have originally *intended* to always raise these instead of retrying them, but git history shows it never actually worked that way. I want to get a patch release out with the new retry timings and logs, but a patch release doesn't seem like a good time to change whether a response gets retried, so I've just made a longer delay here for now.
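To illustrate the effect, here's a standalone condensation of the `get_retry_delay()` logic added in `_client.py` below (assuming the default `backoff=2` and a 429 response with no `Retry-After` header; the helper name is just for this sketch):

```python
# Condensed from the new get_retry_delay() logic: a 429 with no Retry-After
# header gets a 60-second floor, and the usual exponential backoff only
# matters once it exceeds that floor.
DEFAULT_RATE_LIMIT_DELAY = 60

def delay_for_429(retries, backoff=2):
    delay = DEFAULT_RATE_LIMIT_DELAY
    if retries > 0:
        delay = max(backoff * 2 ** (retries - 1), delay)
    return delay

print([delay_for_429(n) for n in range(6)])  # [60, 60, 60, 60, 60, 60]
```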

Then some other assorted bits and bobs:

- Downgrades the log level when we catch and retry exceptions (used to be WARN, now just INFO). The whole point of retrying is to make these situations not special, so I don't think they're actually worth a warning.

- Actually handle all forms of the `Retry-After` header, and move the logic to `_utils`. We previously handled only the integer form (a number of seconds), not the date form; now we handle both (see the usage sketch after this list). I also moved the logic out of the `exceptions` module since it was too much specialized logic for that module (arguably that was already true even before I improved the parsing).

- Fix the `total_time` calculation for `WaybackRetryError`. I’m not sure what I was thinking with the previous code, which was… not even sort-of correct.

- Add `read_and_close()` everywhere that we raise an exception. We did it in some spots, but had missed others.
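Roughly how the new header parsing behaves (this pokes at the private `_utils` module, so treat it as an illustration rather than supported API):

```python
import email.utils
import time
from wayback._utils import parse_retry_after  # private module; illustrative only

print(parse_retry_after('120'))                                     # 120
print(parse_retry_after(email.utils.formatdate(time.time() + 90)))  # ~90 (seconds until that date)
print(parse_retry_after('not a real value'))                        # 0 (unparseable)
print(parse_retry_after(None))                                      # None (no header at all)
```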
Mr0grog committed Nov 28, 2023
1 parent dbec4ce commit d3a982e
Showing 5 changed files with 87 additions and 28 deletions.
1 change: 1 addition & 0 deletions docs/source/release-history.rst
@@ -22,6 +22,7 @@ Fixes & Maintenance
 
 - Adjusted default rate limits to work better with current throttling in use at archive.org. (:issue:`140`)
 - Added more logging around requests that are rate limited. (:issue:`139`)
+- Fix calculation of the ``time`` attribute on :class:`wayback.exceptions.WaybackRetryError`. It turns out it was only accounting for the time spent waiting between retries and skipping the time waiting for the server to respond. (:issue:`142`)
 
 
 v0.4.3 (2023-09-26)
57 changes: 38 additions & 19 deletions wayback/_client.py
@@ -70,6 +70,10 @@
 # Make sure it roughly starts with a valid protocol + domain + port?
 URL_ISH = re.compile(r'^[\w+\-]+://[^/?=&]+\.\w\w+(:\d+)?(/|$)')
 
+# How long to delay after a rate limit error if the response does not include a
+# recommended retry time (e.g. via the `Retry-After` header).
+DEFAULT_RATE_LIMIT_DELAY = 60
+
 
 class Mode(Enum):
     """
@@ -395,40 +399,39 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
     # NOTE: worth considering whether we should push this logic to a custom
     # requests.adapters.HTTPAdapter
     def send(self, *args, **kwargs):
-        total_time = 0
+        start_time = time.time()
         maximum = self.retries
         retries = 0
         while True:
+            retry_delay = 0
             try:
                 logger.debug('sending HTTP request %s "%s", %s', args[0].method, args[0].url, kwargs)
-                result = super().send(*args, **kwargs)
-                if retries >= maximum or not self.should_retry(result):
-                    if result.status_code == 429:
-                        raise RateLimitError(result)
-                    return result
+                response = super().send(*args, **kwargs)
+                retry_delay = self.get_retry_delay(retries, response)
+
+                if retries >= maximum or not self.should_retry(response):
+                    if response.status_code == 429:
+                        read_and_close(response)
+                        raise RateLimitError(response, retry_delay)
+                    return response
                 else:
-                    # TODO: parse and use Retry-After header if present.
-                    # TODO: add additional delay for 429 responses.
-                    logger.debug('Received error response (status: %s), will retry', result.status_code)
+                    logger.debug('Received error response (status: %s), will retry', response.status_code)
+                    read_and_close(response)
             except WaybackSession.handleable_errors as error:
                 response = getattr(error, 'response', None)
-                if response:
+                if response is not None:
                     read_and_close(response)
 
                 if retries >= maximum:
-                    raise WaybackRetryError(retries, total_time, error) from error
+                    raise WaybackRetryError(retries, time.time() - start_time, error) from error
                 elif self.should_retry_error(error):
-                    logger.warn('Caught exception during request, will retry: %s', error)
+                    retry_delay = self.get_retry_delay(retries, response)
+                    logger.info('Caught exception during request, will retry: %s', error)
                 else:
                     raise
 
-            # The first retry has no delay.
-            if retries > 0:
-                seconds = self.backoff * 2 ** (retries - 1)
-                total_time += seconds
-                logger.debug('Will retry after sleeping for %s seconds...', seconds)
-                time.sleep(seconds)
-
+            logger.debug('Will retry after sleeping for %s seconds...', retry_delay)
+            time.sleep(retry_delay)
             retries += 1
 
     # Customize `request` in order to set a default timeout from the session.
@@ -472,6 +475,22 @@ def should_retry_error(self, error):
 
         return False
 
+    def get_retry_delay(self, retries, response=None):
+        delay = 0
+
+        # As of 2023-11-27, the Wayback Machine does not include a `Retry-After`
+        # header on 429 responses, so this parsing is just future-proofing.
+        if response is not None:
+            delay = _utils.parse_retry_after(response.headers.get('Retry-After')) or delay
+            if response.status_code == 429 and delay == 0:
+                delay = DEFAULT_RATE_LIMIT_DELAY
+
+        # No default backoff on the first retry.
+        if retries > 0:
+            delay = max(self.backoff * 2 ** (retries - 1), delay)
+
+        return delay
+
     def reset(self):
         "Reset any network connections the session is using."
         # Close really just closes all the adapters in `self.adapters`. We
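To make the timing concrete: with the defaults (`backoff=2`, `retries=6`) and no `Retry-After` header or 429 in play, the delays produced by `get_retry_delay` above work out as follows (a quick sanity check, not library code):

```python
backoff = 2
# No delay before the first retry, then exponential backoff for the rest.
delays = [0 if attempt == 0 else backoff * 2 ** (attempt - 1) for attempt in range(6)]
print(delays, sum(delays))  # [0, 2, 4, 8, 16, 32] 62 -> about a minute of waiting overall
```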
22 changes: 22 additions & 0 deletions wayback/_utils.py
@@ -2,6 +2,7 @@
 from collections.abc import Mapping, MutableMapping
 from contextlib import contextmanager
 from datetime import date, datetime, timezone
+import email.utils
 import logging
 import re
 import requests
@@ -86,6 +87,27 @@ def parse_timestamp(time_string):
             .replace(tzinfo=timezone.utc))
 
 
+def parse_retry_after(retry_after_header):
+    """
+    Given the value of a ``Retry-After`` header, return the recommended retry
+    time in seconds or ``None`` if there is no recommended timeframe. Returns
+    ``0`` if the time was in the past or could not be parsed.
+    """
+    if isinstance(retry_after_header, str):
+        seconds = 0
+        try:
+            seconds = int(retry_after_header)
+        except ValueError:
+            retry_date_tuple = email.utils.parsedate_tz(retry_after_header)
+            if retry_date_tuple:
+                retry_date = email.utils.mktime_tz(retry_date_tuple)
+                seconds = retry_date - int(time.time())
+
+        return max(0, seconds)
+
+    return None
+
+
 def ensure_utc_datetime(value):
     """
     Given a datetime, date, or Wayback-style timestamp string, return an
10 changes: 2 additions & 8 deletions wayback/exceptions.py
@@ -90,15 +90,9 @@ class RateLimitError(WaybackException):
     ``None``.
     """
 
-    def __init__(self, response):
+    def __init__(self, response, retry_after):
         self.response = response
-
-        # The Wayback Machine does not generally include a `Retry-After` header
-        # at the time of this writing, but this code is included in case they
-        # add it in the future. The standard recommends it:
-        # https://tools.ietf.org/html/rfc6585#section-4
-        retry_header = response.headers.get('Retry-After')
-        self.retry_after = int(retry_header) if retry_header else None
+        self.retry_after = retry_after
 
         message = 'Wayback rate limit exceeded'
         if self.retry_after:
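For context, a hypothetical caller-side sketch (not part of this commit) of how the `retry_after` value can now be used directly instead of re-parsing response headers:

```python
import time
from wayback import WaybackClient
from wayback.exceptions import RateLimitError

client = WaybackClient()
try:
    record = next(client.search('example.com'))
except RateLimitError as error:
    # retry_after now comes from the session's get_retry_delay(), so for a
    # 429 it should be at least the new 60-second default.
    time.sleep(error.retry_after or 60)
```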
25 changes: 24 additions & 1 deletion wayback/tests/test_utils.py
@@ -1,7 +1,8 @@
 from datetime import datetime, timezone
+import email.utils
 import pytest
 import time
-from .._utils import memento_url_data, rate_limited
+from .._utils import memento_url_data, rate_limited, parse_retry_after
 
 
 class TestMementoUrlData:
@@ -75,3 +76,25 @@ def test_simultaneous_ratelimits(self):
         with rate_limited(calls_per_second=3, group='sim2'):
             pass
         assert 1.66 <= time.time() - start_time <= 1.7
+
+
+class TestParseRetryAfter:
+    def test_seconds(self):
+        assert 30 == parse_retry_after('30')
+
+    def test_date(self):
+        result = parse_retry_after(email.utils.formatdate(time.time() + 30))
+        assert 29 <= result <= 31
+
+    def test_no_input(self):
+        assert parse_retry_after(None) is None
+
+    def test_invalid_input(self):
+        assert 0 == parse_retry_after('hello!')
+
+    def test_negative_seconds(self):
+        assert 0 == parse_retry_after('-10')
+
+    def test_negative_date(self):
+        result = parse_retry_after(email.utils.formatdate(time.time() - 30))
+        assert 0 == result
