Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A little more rate limit tweaking for v0.4.4 #142

Merged
merged 8 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release-history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Fixes & Maintenance

- Adjusted default rate limits to work better with current throttling in use at archive.org. (:issue:`140`)
- Added more logging around requests and rate limiting. (:issue:`139`)
- Fix calculation of the ``time`` attribute on :class:`wayback.exceptions.WaybackRetryError`. It turns out it was only accounting for the time spent waiting between retries and skipping the time waiting for the server to respond. (:issue:`142`)


v0.4.3 (2023-09-26)
Expand Down
57 changes: 38 additions & 19 deletions wayback/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@
# Make sure it roughly starts with a valid protocol + domain + port?
URL_ISH = re.compile(r'^[\w+\-]+://[^/?=&]+\.\w\w+(:\d+)?(/|$)')

# How long to delay after a rate limit error if the response does not include a
# recommended retry time (e.g. via the `Retry-After` header).
DEFAULT_RATE_LIMIT_DELAY = 60


class Mode(Enum):
"""
Expand Down Expand Up @@ -395,40 +399,39 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
# NOTE: worth considering whether we should push this logic to a custom
# requests.adapters.HTTPAdapter
def send(self, *args, **kwargs):
total_time = 0
start_time = time.time()
maximum = self.retries
retries = 0
while True:
retry_delay = 0
try:
logger.debug('sending HTTP request %s "%s", %s', args[0].method, args[0].url, kwargs)
result = super().send(*args, **kwargs)
if retries >= maximum or not self.should_retry(result):
if result.status_code == 429:
raise RateLimitError(result)
return result
response = super().send(*args, **kwargs)
retry_delay = self.get_retry_delay(retries, response)

if retries >= maximum or not self.should_retry(response):
if response.status_code == 429:
read_and_close(response)
raise RateLimitError(response, retry_delay)
return response
else:
# TODO: parse and use Retry-After header if present.
# TODO: add additional delay for 429 responses.
logger.debug('Received error response (status: %s), will retry', result.status_code)
logger.debug('Received error response (status: %s), will retry', response.status_code)
read_and_close(response)
except WaybackSession.handleable_errors as error:
response = getattr(error, 'response', None)
if response:
if response is not None:
read_and_close(response)

if retries >= maximum:
raise WaybackRetryError(retries, total_time, error) from error
raise WaybackRetryError(retries, time.time() - start_time, error) from error
elif self.should_retry_error(error):
logger.warn('Caught exception during request, will retry: %s', error)
retry_delay = self.get_retry_delay(retries, response)
logger.info('Caught exception during request, will retry: %s', error)
else:
raise

# The first retry has no delay.
if retries > 0:
seconds = self.backoff * 2 ** (retries - 1)
total_time += seconds
logger.debug('Will retry after sleeping for %s seconds...', seconds)
time.sleep(seconds)

logger.debug('Will retry after sleeping for %s seconds...', retry_delay)
time.sleep(retry_delay)
retries += 1

# Customize `request` in order to set a default timeout from the session.
Expand Down Expand Up @@ -472,6 +475,22 @@ def should_retry_error(self, error):

return False

def get_retry_delay(self, retries, response=None):
delay = 0

# As of 2023-11-27, the Wayback Machine does not include a `Retry-After`
# header on 429 responses, so this parsing is just future-proofing.
if response is not None:
delay = _utils.parse_retry_after(response.headers.get('Retry-After')) or delay
if response.status_code == 429 and delay == 0:
delay = DEFAULT_RATE_LIMIT_DELAY

# No default backoff on the first retry.
if retries > 0:
delay = max(self.backoff * 2 ** (retries - 1), delay)

return delay

def reset(self):
"Reset any network connections the session is using."
# Close really just closes all the adapters in `self.adapters`. We
Expand Down
22 changes: 22 additions & 0 deletions wayback/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections.abc import Mapping, MutableMapping
from contextlib import contextmanager
from datetime import date, datetime, timezone
import email.utils
import logging
import re
import requests
Expand Down Expand Up @@ -86,6 +87,27 @@ def parse_timestamp(time_string):
.replace(tzinfo=timezone.utc))


def parse_retry_after(retry_after_header):
    """
    Given the value of a ``Retry-After`` HTTP header, return the recommended
    retry delay in seconds, or ``None`` if the value is not a string (e.g.
    the header was absent and ``headers.get()`` returned ``None``). Returns
    ``0`` if the time was in the past or could not be parsed.

    Parameters
    ----------
    retry_after_header : str or None
        The raw header value. Per RFC 9110 §10.2.3 this may be either a
        number of seconds or an HTTP date.

    Returns
    -------
    int or None
    """
    if isinstance(retry_after_header, str):
        seconds = 0
        try:
            # Simple form: an integer number of seconds.
            seconds = int(retry_after_header)
        except ValueError:
            # Alternate form: an HTTP date; convert it to a delta from now.
            retry_date_tuple = email.utils.parsedate_tz(retry_after_header)
            if retry_date_tuple:
                retry_date = email.utils.mktime_tz(retry_date_tuple)
                seconds = retry_date - int(time.time())

        # Clamp negative values (past dates / negative seconds) to 0.
        return max(0, seconds)

    return None


def ensure_utc_datetime(value):
"""
Given a datetime, date, or Wayback-style timestamp string, return an
Expand Down
10 changes: 2 additions & 8 deletions wayback/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,9 @@ class RateLimitError(WaybackException):
``None``.
"""

def __init__(self, response):
def __init__(self, response, retry_after):
self.response = response

# The Wayback Machine does not generally include a `Retry-After` header
# at the time of this writing, but this code is included in case they
# add it in the future. The standard recommends it:
# https://tools.ietf.org/html/rfc6585#section-4
retry_header = response.headers.get('Retry-After')
self.retry_after = int(retry_header) if retry_header else None
self.retry_after = retry_after

message = 'Wayback rate limit exceeded'
if self.retry_after:
Expand Down
25 changes: 24 additions & 1 deletion wayback/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from datetime import datetime, timezone
import email.utils
import pytest
import time
from .._utils import memento_url_data, rate_limited
from .._utils import memento_url_data, rate_limited, parse_retry_after


class TestMementoUrlData:
Expand Down Expand Up @@ -75,3 +76,25 @@ def test_simultaneous_ratelimits(self):
with rate_limited(calls_per_second=3, group='sim2'):
pass
assert 1.66 <= time.time() - start_time <= 1.7


class TestParseRetryAfter:
    """Tests for ``wayback._utils.parse_retry_after``."""

    def test_seconds(self):
        # A plain integer string is interpreted as a count of seconds.
        assert parse_retry_after('30') == 30

    def test_negative_seconds(self):
        # Negative values are clamped to zero.
        assert parse_retry_after('-10') == 0

    def test_date(self):
        # An HTTP date 30 s in the future yields roughly 30 s (with slack
        # for formatting granularity and test runtime).
        future_header = email.utils.formatdate(time.time() + 30)
        assert 29 <= parse_retry_after(future_header) <= 31

    def test_negative_date(self):
        # A date in the past is clamped to zero.
        past_header = email.utils.formatdate(time.time() - 30)
        assert parse_retry_after(past_header) == 0

    def test_invalid_input(self):
        # Unparseable strings fall back to zero rather than raising.
        assert parse_retry_after('hello!') == 0

    def test_no_input(self):
        # A missing header (None) yields None, not a number.
        assert parse_retry_after(None) is None