Remove requests from public interfaces #157

Draft: wants to merge 17 commits into main (showing changes from 8 commits)
183 changes: 41 additions & 142 deletions src/wayback/_client.py
@@ -19,22 +19,19 @@
import hashlib
import logging
import re
-import requests
-from requests.exceptions import (ChunkedEncodingError,
-ConnectionError,
-ContentDecodingError,
+from requests.exceptions import (ConnectionError,
ProxyError,
RetryError,
Timeout)
import time
from urllib.parse import urljoin, urlparse
-from urllib3.connectionpool import HTTPConnectionPool
from urllib3.exceptions import (ConnectTimeoutError,
MaxRetryError,
ReadTimeoutError)
from warnings import warn
from . import _utils, __version__
from ._models import CdxRecord, Memento
+from ._http import WaybackHttpResponse, WaybackHttpAdapter
from .exceptions import (WaybackException,
UnexpectedResponseFormat,
BlockedByRobotsError,
@@ -148,7 +145,7 @@ def is_malformed_url(url):
return False


-def is_memento_response(response):
+def is_memento_response(response: WaybackHttpResponse) -> bool:
return 'Memento-Datetime' in response.headers


@@ -158,18 +155,6 @@ def cdx_hash(content):
return b32encode(hashlib.sha1(content).digest()).decode()


-def read_and_close(response):
-# Read content so it gets cached and close the response so
-# we can release the connection for reuse. See:
-# https://github.com/psf/requests/blob/eedd67462819f8dbf8c1c32e77f9070606605231/requests/sessions.py#L160-L163
-try:
-response.content
-except (ChunkedEncodingError, ContentDecodingError, RuntimeError):
-response.raw.read(decode_content=False)
-finally:
-response.close()


REDIRECT_PAGE_PATTERN = re.compile(r'Got an? HTTP 3\d\d response at crawl time', re.IGNORECASE)


@@ -271,75 +256,10 @@ def clean_memento_links(links, mode):
return result


-#####################################################################
-# HACK: handle malformed Content-Encoding headers from Wayback.
-# When you send `Accept-Encoding: gzip` on a request for a memento, Wayback
-# will faithfully gzip the response body. However, if the original response
-# from the web server that was snapshotted was gzipped, Wayback screws up the
-# `Content-Encoding` header on the memento response, leading any HTTP client to
-# *not* decompress the gzipped body. Wayback folks have no clear timeline for
-# a fix, hence the workaround here.
-#
-# More info in this issue:
-# https://github.com/edgi-govdata-archiving/web-monitoring-processing/issues/309
-#
-# Example broken Wayback URL:
-# http://web.archive.org/web/20181023233237id_/http://cwcgom.aoml.noaa.gov/erddap/griddap/miamiacidification.graph
-#
-if hasattr(HTTPConnectionPool, 'ResponseCls'):
-# urllib3 v1.x:
-#
-# This subclass of urllib3's response class identifies the malformed headers
-# and repairs them before instantiating the actual response object, so when
-# it reads the body, it knows to decode it correctly.
-#
-# See what we're overriding from urllib3:
-# https://github.com/urllib3/urllib3/blob/a6ec68a5c5c5743c59fe5c62c635c929586c429b/src/urllib3/response.py#L499-L526
-class WaybackResponse(HTTPConnectionPool.ResponseCls):
-@classmethod
-def from_httplib(cls, httplib_response, **response_kwargs):
-headers = httplib_response.msg
-pairs = headers.items()
-if ('content-encoding', '') in pairs and ('Content-Encoding', 'gzip') in pairs:
-del headers['content-encoding']
-headers['Content-Encoding'] = 'gzip'
-return super().from_httplib(httplib_response, **response_kwargs)
-
-HTTPConnectionPool.ResponseCls = WaybackResponse
-else:
-# urllib3 v2.x:
-#
-# Unfortunately, we can't wrap the `HTTPConnection.getresponse` method in
-# urllib3 v2, since it may have read the response body before returning. So
-# we patch the HTTPHeaderDict class instead.
-from urllib3._collections import HTTPHeaderDict as Urllib3HTTPHeaderDict
-_urllib3_header_init = Urllib3HTTPHeaderDict.__init__
-
-def _new_header_init(self, headers=None, **kwargs):
-if headers:
-if isinstance(headers, (Urllib3HTTPHeaderDict, dict)):
-pairs = list(headers.items())
-else:
-pairs = list(headers)
-if (
-('content-encoding', '') in pairs and
-('Content-Encoding', 'gzip') in pairs
-):
-headers = [pair for pair in pairs
-if pair[0].lower() != 'content-encoding']
-headers.append(('Content-Encoding', 'gzip'))
-
-return _urllib3_header_init(self, headers, **kwargs)
-
-Urllib3HTTPHeaderDict.__init__ = _new_header_init
-# END HACK
-#####################################################################

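As a standalone illustration of the repair both removed patches performed (a hypothetical, self-contained sketch; `repair_wayback_headers` is not a real function in this codebase):

```python
# Hypothetical sketch: the header repair the removed hack performed,
# extracted from the logic above. Not part of the library.
def repair_wayback_headers(pairs):
    # Broken Wayback mementos carry both a blank lowercase header and a
    # correct gzip header; collapse them into a single gzip header.
    if ('content-encoding', '') in pairs and ('Content-Encoding', 'gzip') in pairs:
        pairs = [p for p in pairs if p[0].lower() != 'content-encoding']
        pairs.append(('Content-Encoding', 'gzip'))
    return pairs

print(repair_wayback_headers([('content-encoding', ''),
                              ('Content-Encoding', 'gzip')]))
# [('Content-Encoding', 'gzip')]
```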

-class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
+class WaybackSession(_utils.DisableAfterCloseSession):
"""
-A custom session object that pools network connections and resources for
-requests to the Wayback Machine.
+Sessions manage HTTP requests and network traffic to web servers, adding
+functionality like retries and rate limiting.

Parameters
----------
@@ -420,6 +340,7 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
# The memento limit is actually a generic Wayback limit.
'/': _utils.RateLimit.make_limit(memento_calls_per_second),
}
+self.adapter = WaybackHttpAdapter()
# NOTE: the nice way to accomplish retry/backoff is with urllib3:
# adapter = requests.adapters.HTTPAdapter(
# max_retries=Retry(total=5, backoff_factor=2,
@@ -433,15 +354,16 @@
# with Wayback's APIs, but urllib3 logs a warning on every retry:
# https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737

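For reference, the `Retry`-based setup the NOTE above sketches would look roughly like this (a hypothetical illustration, not part of this diff; the status codes are placeholders):

```python
# Sketch of the urllib3 Retry approach described in the NOTE above.
# Hypothetical: not part of this PR; the forcelist values are illustrative.
import requests
from urllib3.util import Retry

session = requests.Session()
adapter = requests.adapters.HTTPAdapter(
    max_retries=Retry(total=5, backoff_factor=2,
                      status_forcelist=(503, 504)))
session.mount('https://', adapter)
session.mount('http://', adapter)
```

As the comment notes, this works well with Wayback's APIs, but urllib3 logs a warning on every retry, which is why the session implements retries itself instead.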
-# Customize the built-in `send()` with retryability and rate limiting.
-def send(self, request: requests.PreparedRequest, **kwargs):
+def request(self, method, url, timeout=-1, **kwargs) -> WaybackHttpResponse:
+super().request()
Member Author comment:

This super() call is really just to keep _utils.DisableAfterCloseSession working, and I think the reality here is that we should probably just get rid of that utility. It provides some nice guardrails, but they don’t offer a huge amount of value.
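For context, a minimal sketch of the kind of guardrail such a utility might provide (an assumption; the real `_utils.DisableAfterCloseSession` may differ):

```python
# Hypothetical sketch of a DisableAfterCloseSession-style guardrail.
# Assumption: the actual _utils.DisableAfterCloseSession may differ.
class DisableAfterCloseSession:
    _closed = False

    def close(self, disable=True):
        # `disable=False` (as the removed reset() used) would release
        # connections without permanently disabling the session.
        if disable:
            self._closed = True

    def request(self, *args, **kwargs):
        if self._closed:
            raise RuntimeError('This session has already been closed.')
```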

start_time = time.time()
+timeout = self.timeout if timeout == -1 else timeout
maximum = self.retries
retries = 0

-url = urlparse(request.url)
+parsed_url = urlparse(url)
for path, limit in self.rate_limts.items():
-if url.path.startswith(path):
+if parsed_url.path.startswith(path):
rate_limit = limit
break
else:
@@ -450,23 +372,23 @@
while True:
retry_delay = 0
try:
-logger.debug('sending HTTP request %s "%s", %s', request.method, request.url, kwargs)
+logger.debug('sending HTTP request %s "%s", %s', method, url, kwargs)
rate_limit.wait()
-response = super().send(request, **kwargs)
+response = self.adapter.request(method, url, timeout=timeout, **kwargs)
retry_delay = self.get_retry_delay(retries, response)

if retries >= maximum or not self.should_retry(response):
if response.status_code == 429:
-read_and_close(response)
+response.close()
raise RateLimitError(response, retry_delay)
return response
else:
logger.debug('Received error response (status: %s), will retry', response.status_code)
-read_and_close(response)
+response.close(cache=False)
except WaybackSession.handleable_errors as error:
response = getattr(error, 'response', None)
if response is not None:
-read_and_close(response)
+response.close()

if retries >= maximum:
raise WaybackRetryError(retries, time.time() - start_time, error) from error
@@ -480,23 +402,6 @@ def send(self, request: requests.PreparedRequest, **kwargs):
time.sleep(retry_delay)
retries += 1

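To make the new control flow concrete, a hypothetical call against the refactored session (names taken from this diff; the URL and argument values are illustrative):

```python
# Hypothetical usage of the refactored WaybackSession in this diff.
session = WaybackSession(retries=3, timeout=30)
response = session.request(
    'GET', 'https://web.archive.org/web/2019/https://example.com/')
print(response.status_code)
response.close()  # release the connection, as the retry loop above does
```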
-# Customize `request` in order to set a default timeout from the session.
-# We can't do this in `send` because `request` always passes a `timeout`
-# keyword to `send`. Inside `send`, we can't tell the difference between a
-# user explicitly requesting no timeout and not setting one at all.
-def request(self, method, url, **kwargs):
-"""
-Perform an HTTP request using this session. For arguments and return
-values, see:
-https://requests.readthedocs.io/en/latest/api/#requests.Session.request
-
-If the ``timeout`` keyword argument is not set, it will default to the
-session's ``timeout`` attribute.
-"""
-if 'timeout' not in kwargs:
-kwargs['timeout'] = self.timeout
-return super().request(method, url, **kwargs)

def should_retry(self, response):
# A memento may actually be a capture of an error, so don't retry it :P
if is_memento_response(response):
@@ -539,18 +444,12 @@ def get_retry_delay(self, retries, response=None):

def reset(self):
"Reset any network connections the session is using."
-# Close really just closes all the adapters in `self.adapters`. We
-# could do that directly, but `self.adapters` is not documented/public,
-# so might be somewhat risky.
-self.close(disable=False)
-# Re-build the standard adapters. See:
-# https://github.com/kennethreitz/requests/blob/v2.22.0/requests/sessions.py#L415-L418
-self.mount('https://', requests.adapters.HTTPAdapter())
-self.mount('http://', requests.adapters.HTTPAdapter())
Member Author comment:

IIRC, I added this to help with some problems we were having managing connections in big jobs at EDGI; those were later solved with better fixes. It's kind of a weird method, and its presence is even weirder with the refactoring here. I think I should just drop it entirely.

+self.adapter.close()


# TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that
-# create a custom instance of urllib3.util.Retry
+# create a custom instance of urllib3.util.Retry?
class WaybackClient(_utils.DepthCountedContext):
"""
A client for retrieving data from the Internet Archive's Wayback Machine.
@@ -564,6 +463,8 @@ class WaybackClient(_utils.DepthCountedContext):
----------
session : WaybackSession, optional
"""
+session: WaybackSession
+
def __init__(self, session=None):
self.session = session or WaybackSession()

@@ -785,22 +686,21 @@ def search(self, url, *, match_type=None, limit=1000, offset=None,
sent_query, next_query = next_query, None
response = self.session.request('GET', CDX_SEARCH_URL,
params=sent_query)
-try:
-# Read/cache the response and close straightaway. If we need
-# to raise for status, we want to pre-emptively close the
-# response so a user handling the error doesn't need to
-# worry about it. If we don't raise here, we still want to
-# close the connection so it doesn't leak when we move on to
-# the next page of results or when this iterator ends.
-read_and_close(response)
-response.raise_for_status()
-except requests.exceptions.HTTPError as error:
+
+# Read/cache the response and close straightaway. If we need
+# to raise for status, we want to pre-emptively close the
+# response so a user handling the error doesn't need to
+# worry about it. If we don't raise here, we still want to
+# close the connection so it doesn't leak when we move on to
+# the next page of results or when this iterator ends.
+response.close()
+if response.status_code >= 400:
if 'AdministrativeAccessControlException' in response.text:
raise BlockedSiteError(query['url'])
elif 'RobotAccessControlException' in response.text:
raise BlockedByRobotsError(query['url'])
else:
-raise WaybackException(str(error))
+raise WaybackException(f'HTTP {response.status_code} error for CDX search: "{query}"')

lines = iter(response.content.splitlines())

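For orientation, a hypothetical use of the search API whose error handling changed above (field names follow the `CdxRecord` model imported at the top of the file):

```python
# Hypothetical usage of WaybackClient.search (signature from this diff).
with WaybackClient() as client:
    for record in client.search('example.com', limit=10):
        print(record.timestamp, record.url)
```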
@@ -1018,12 +918,11 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
current_url, current_date, current_mode = _utils.memento_url_data(response.url)

# In view mode, redirects need special handling.
-if current_mode == Mode.view.value:
+redirect_url = response.redirect_url
+if current_mode == Mode.view.value and not redirect_url:
redirect_url = detect_view_mode_redirect(response, current_date)
if redirect_url:
# Fix up response properties to be like other modes.
-redirect = requests.Request('GET', redirect_url)
-response._next = self.session.prepare_request(redirect)
response.headers['Memento-Datetime'] = current_date.strftime(
'%a, %d %b %Y %H:%M:%S %Z'
)
@@ -1062,11 +961,11 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
# rarely have been captured at the same time as the
# redirect itself. (See 2b)
playable = False
-if response.next and (
+if redirect_url and (
(len(history) == 0 and not exact) or
(len(history) > 0 and (previous_was_memento or not exact_redirects))
):
-target_url, target_date, _ = _utils.memento_url_data(response.next.url)
+target_url, target_date, _ = _utils.memento_url_data(redirect_url)
# A non-memento redirect is generally taking us to the
# closest-in-time capture of the same URL. Note that this is
# NOT the next capture -- i.e. the one that would have
@@ -1095,7 +994,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
playable = True

if not playable:
-read_and_close(response)
+response.close()
message = response.headers.get('X-Archive-Wayback-Runtime-Error', '')
if (
('AdministrativeAccessControlException' in message) or
@@ -1109,7 +1008,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
raise BlockedByRobotsError(f'{url} is blocked by robots.txt')
elif message:
raise MementoPlaybackError(f'Memento at {url} could not be played: {message}')
-elif response.ok:
+elif response.is_success:
# TODO: Raise more specific errors for the possible
# cases here. We *should* only arrive here when
# there's a redirect and:
@@ -1124,21 +1023,21 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
raise MementoPlaybackError(f'{response.status_code} error while loading '
f'memento at {url}')

-if response.next:
+if redirect_url:
previous_was_memento = is_memento
-read_and_close(response)
+response.close()

# Wayback sometimes has circular memento redirects ¯\_(ツ)_/¯
urls.add(response.url)
-if response.next.url in urls:
+if redirect_url in urls:
raise MementoPlaybackError(f'Memento at {url} is circular')

# All requests are included in `debug_history`, but
# `history` only shows redirects that were mementos.
debug_history.append(response.url)
if is_memento:
history.append(memento)
-response = self.session.send(response.next, allow_redirects=False)
+response = self.session.request('GET', redirect_url, allow_redirects=False)
else:
break

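Finally, a hypothetical call that exercises the redirect-following loop above (parameters from this diff; the timestamp is taken from the example URL in the removed hack's comment):

```python
# Hypothetical usage of WaybackClient.get_memento (signature from this diff).
with WaybackClient() as client:
    memento = client.get_memento('https://example.com/',
                                 timestamp='20181023233237')
    print(memento.url, memento.headers.get('Memento-Datetime'))
```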