From 46601e5ff21e0bfd9430d971c67f40b1f929668e Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 1 Feb 2024 10:41:40 -0800 Subject: [PATCH] Don't raise on *archived* rate limit errors (#159) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A memento can be an archive of an old rate limit error (status code 429) and in our feverish run to handle rate limit errors better at the end of 2023, we caused `WaybackSession.send()` to raise exceptions for both real rate limits *and* archived ones. However, the archived ones might be an actual memento that you were looking for, and should have been exempted from raising. This solves the issue by simply checking whether a response is a memento and returning it immediately without doing any other checks, since the *effective* status code for a memento is always 200. (Checking various attributes of a memento is complicated, so it’s better to just return them right away rather than remembering to make complex exceptions in all the places where various response attributes have to be treated differently for mementos.) Fixes #158. --- src/wayback/_client.py | 10 +- ...orks_on_archived_rate_limit_responses.yaml | 113 ++++++++++++++++++ src/wayback/tests/test_client.py | 10 ++ 3 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 0593e8f2..4d1a3cf8 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -455,7 +455,11 @@ def send(self, request: requests.PreparedRequest, **kwargs): response = super().send(request, **kwargs) retry_delay = self.get_retry_delay(retries, response) - if retries >= maximum or not self.should_retry(response): + if is_memento_response(response): + # Mementos are necessarily successful responses, so just + # return them without any other checks. 
+ return response + elif retries >= maximum or not self.should_retry(response): if response.status_code == 429: read_and_close(response) raise RateLimitError(response, retry_delay) @@ -498,10 +502,6 @@ def request(self, method, url, **kwargs): return super().request(method, url, **kwargs) def should_retry(self, response): - # A memento may actually be a capture of an error, so don't retry it :P - if is_memento_response(response): - return False - return response.status_code in self.retryable_statuses def should_retry_error(self, error): diff --git a/src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml b/src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml new file mode 100644 index 00000000..dab5f462 --- /dev/null +++ b/src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml @@ -0,0 +1,113 @@ +interactions: +- request: + body: null + headers: + Accept-Encoding: + - gzip, deflate + User-Agent: + - wayback/0.4.5.dev10+gb7a16cd.d20231218 (+https://github.com/edgi-govdata-archiving/wayback) + method: GET + uri: https://web.archive.org/web/20150129034904id_/http://www.reddit.com/r/PokemonGiveaway + response: + body: + string: "\n\n\n \n Too Many Requests\n + \ \n \n \n + \

whoa there, pardner!

\n \n\n\n

we're sorry, but you appear + to be a bot and we've seen too many requests\nfrom you lately. we enforce + a hard speed limit on requests that appear to come\nfrom bots to prevent abuse.

\n\n

if + you are not a bot but are spoofing one via your browser's user agent\nstring: + please change your user agent string to avoid seeing this message\nagain.

\n\n

please + wait 6 second(s) and try again.

\n\n

as a reminder to developers, + we recommend that clients make no\n more than one\n + \ request every two seconds to avoid seeing this message.

\n \n\n" + headers: + Connection: + - keep-alive + Content-Type: + - text/html; charset=UTF-8 + Date: + - Thu, 01 Feb 2024 18:20:31 GMT + Permissions-Policy: + - interest-cohort=() + Referrer-Policy: + - no-referrer-when-downgrade + Server: + - nginx + Transfer-Encoding: + - chunked + X-NA: + - '0' + X-NID: + - '-' + X-Page-Cache: + - MISS + X-RL: + - '1' + X-location: + - All + cache-control: + - max-age=1800 + content-security-policy: + - 'default-src ''self'' ''unsafe-eval'' ''unsafe-inline'' data: blob: archive.org + web.archive.org web-static.archive.org wayback-api.archive.org analytics.archive.org + pragma.archivelab.org' + link: + - ; rel="original", ; + rel="timemap"; type="application/link-format", ; + rel="timegate", ; + rel="first memento"; datetime="Tue, 26 Jun 2012 00:00:27 GMT", ; + rel="prev memento"; datetime="Tue, 09 Dec 2014 12:01:44 GMT", ; + rel="memento"; datetime="Thu, 29 Jan 2015 03:49:04 GMT", ; + rel="next memento"; datetime="Sun, 08 Feb 2015 03:27:10 GMT", ; + rel="last memento"; datetime="Fri, 20 Oct 2023 10:43:50 GMT" + memento-datetime: + - Thu, 29 Jan 2015 03:49:04 GMT + server-timing: + - exclusion.robots;dur=1.346979, exclusion.robots.policy;dur=1.258865, cdx.remote;dur=0.566878, + esindex;dur=0.070942, LoadShardBlock;dur=668.835646, PetaboxLoader3.datanode;dur=362.949615, + PetaboxLoader3.resolve;dur=109.386489, load_resource;dur=78.884440 + x-app-server: + - wwwb-app220 + x-archive-orig-cache-control: + - no-cache + x-archive-orig-cf-cache-status: + - EXPIRED + x-archive-orig-cf-ray: + - 1b02752d98b0012c-SJC + x-archive-orig-connection: + - close + x-archive-orig-content-length: + - '-1' + x-archive-orig-date: + - Thu, 29 Jan 2015 03:49:04 GMT + x-archive-orig-edge-control: + - bypass-cache + x-archive-orig-retry-after: + - '6' + x-archive-orig-server: + - cloudflare-nginx + x-archive-orig-vary: + - accept-encoding + x-archive-orig-x-content-type-options: + - nosniff + x-archive-orig-x-frame-options: + - SAMEORIGIN + 
x-archive-orig-x-moose: + - majestic + x-archive-orig-x-ua-compatible: + - IE=edge + x-archive-orig-x-xss-protection: + - 1; mode=block + x-archive-src: + - liveweb-20150129011011/live-20150129000440-wwwb-app16.us.archive.org.warc.gz + x-tr: + - '1820' + x-ts: + - '429' + status: + code: 429 + message: Too Many Requests +version: 1 diff --git a/src/wayback/tests/test_client.py b/src/wayback/tests/test_client.py index f236a766..0a30f82b 100644 --- a/src/wayback/tests/test_client.py +++ b/src/wayback/tests/test_client.py @@ -609,6 +609,16 @@ def test_get_memento_raises_no_memento_error(): '20170929002712') +@ia_vcr.use_cassette() +def test_get_memento_works_on_archived_rate_limit_responses(): + with WaybackClient() as client: + memento = client.get_memento('http://www.reddit.com/r/PokemonGiveaway', + timestamp=datetime(2015, 1, 29, 3, 49, 4), + exact=True) + assert 'http://www.reddit.com/r/PokemonGiveaway' == memento.url + assert 429 == memento.status_code + + @ia_vcr.use_cassette() def test_get_memento_follows_historical_redirects(): with WaybackClient() as client: