Remove requests from public interfaces #157

Draft: wants to merge 17 commits into main (showing changes from 8 commits)
183 changes: 41 additions & 142 deletions src/wayback/_client.py
@@ -19,22 +19,19 @@
import hashlib
import logging
import re
-import requests
-from requests.exceptions import (ChunkedEncodingError,
-ConnectionError,
-ContentDecodingError,
+from requests.exceptions import (ConnectionError,
ProxyError,
RetryError,
Timeout)
import time
from urllib.parse import urljoin, urlparse
-from urllib3.connectionpool import HTTPConnectionPool
from urllib3.exceptions import (ConnectTimeoutError,
MaxRetryError,
ReadTimeoutError)
from warnings import warn
from . import _utils, __version__
from ._models import CdxRecord, Memento
+from ._http import WaybackHttpResponse, WaybackHttpAdapter
from .exceptions import (WaybackException,
UnexpectedResponseFormat,
BlockedByRobotsError,
@@ -148,7 +145,7 @@ def is_malformed_url(url):
return False


-def is_memento_response(response):
+def is_memento_response(response: WaybackHttpResponse) -> bool:
return 'Memento-Datetime' in response.headers


@@ -158,18 +155,6 @@ def cdx_hash(content):
return b32encode(hashlib.sha1(content).digest()).decode()


-def read_and_close(response):
-# Read content so it gets cached and close the response so
-# we can release the connection for reuse. See:
-# https://github.com/psf/requests/blob/eedd67462819f8dbf8c1c32e77f9070606605231/requests/sessions.py#L160-L163
-try:
-response.content
-except (ChunkedEncodingError, ContentDecodingError, RuntimeError):
-response.raw.read(decode_content=False)
-finally:
-response.close()


REDIRECT_PAGE_PATTERN = re.compile(r'Got an? HTTP 3\d\d response at crawl time', re.IGNORECASE)


@@ -271,75 +256,10 @@ def clean_memento_links(links, mode):
return result


-#####################################################################
-# HACK: handle malformed Content-Encoding headers from Wayback.
-# When you send `Accept-Encoding: gzip` on a request for a memento, Wayback
-# will faithfully gzip the response body. However, if the original response
-# from the web server that was snapshotted was gzipped, Wayback screws up the
-# `Content-Encoding` header on the memento response, leading any HTTP client to
-# *not* decompress the gzipped body. Wayback folks have no clear timeline for
-# a fix, hence the workaround here.
-#
-# More info in this issue:
-# https://github.com/edgi-govdata-archiving/web-monitoring-processing/issues/309
-#
-# Example broken Wayback URL:
-# http://web.archive.org/web/20181023233237id_/http://cwcgom.aoml.noaa.gov/erddap/griddap/miamiacidification.graph
-#
-if hasattr(HTTPConnectionPool, 'ResponseCls'):
-# urllib3 v1.x:
-#
-# This subclass of urllib3's response class identifies the malformed headers
-# and repairs them before instantiating the actual response object, so when
-# it reads the body, it knows to decode it correctly.
-#
-# See what we're overriding from urllib3:
-# https://github.com/urllib3/urllib3/blob/a6ec68a5c5c5743c59fe5c62c635c929586c429b/src/urllib3/response.py#L499-L526
-class WaybackResponse(HTTPConnectionPool.ResponseCls):
-@classmethod
-def from_httplib(cls, httplib_response, **response_kwargs):
-headers = httplib_response.msg
-pairs = headers.items()
-if ('content-encoding', '') in pairs and ('Content-Encoding', 'gzip') in pairs:
-del headers['content-encoding']
-headers['Content-Encoding'] = 'gzip'
-return super().from_httplib(httplib_response, **response_kwargs)
-
-HTTPConnectionPool.ResponseCls = WaybackResponse
-else:
-# urllib3 v2.x:
-#
-# Unfortunately, we can't wrap the `HTTPConnection.getresponse` method in
-# urllib3 v2, since it may have read the response body before returning. So
-# we patch the HTTPHeaderDict class instead.
-from urllib3._collections import HTTPHeaderDict as Urllib3HTTPHeaderDict
-_urllib3_header_init = Urllib3HTTPHeaderDict.__init__
-
-def _new_header_init(self, headers=None, **kwargs):
-if headers:
-if isinstance(headers, (Urllib3HTTPHeaderDict, dict)):
-pairs = list(headers.items())
-else:
-pairs = list(headers)
-if (
-('content-encoding', '') in pairs and
-('Content-Encoding', 'gzip') in pairs
-):
-headers = [pair for pair in pairs
-if pair[0].lower() != 'content-encoding']
-headers.append(('Content-Encoding', 'gzip'))
-
-return _urllib3_header_init(self, headers, **kwargs)
-
-Urllib3HTTPHeaderDict.__init__ = _new_header_init
-# END HACK
-#####################################################################

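As a standalone illustration of the repair both removed patches performed (a hypothetical, self-contained sketch; `repair_wayback_headers` is not a real function in this codebase):

```python
# Hypothetical sketch: the header repair the removed hack performed,
# extracted from the logic above. Not part of the library.
def repair_wayback_headers(pairs):
    # Broken Wayback mementos carry both a blank lowercase header and a
    # correct gzip header; collapse them into a single gzip header.
    if ('content-encoding', '') in pairs and ('Content-Encoding', 'gzip') in pairs:
        pairs = [p for p in pairs if p[0].lower() != 'content-encoding']
        pairs.append(('Content-Encoding', 'gzip'))
    return pairs

print(repair_wayback_headers([('content-encoding', ''),
                              ('Content-Encoding', 'gzip')]))
# [('Content-Encoding', 'gzip')]
```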

-class WaybackSession(_utils.DisableAfterCloseSession, requests.Session):
+class WaybackSession(_utils.DisableAfterCloseSession):
"""
-A custom session object that pools network connections and resources for
-requests to the Wayback Machine.
+Sessions manage HTTP requests and network traffic to web servers, adding
+functionality like retries and rate limiting.

Parameters
----------
@@ -420,6 +340,7 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
# The memento limit is actually a generic Wayback limit.
'/': _utils.RateLimit.make_limit(memento_calls_per_second),
}
+self.adapter = WaybackHttpAdapter()
# NOTE: the nice way to accomplish retry/backoff is with urllib3:
# adapter = requests.adapters.HTTPAdapter(
# max_retries=Retry(total=5, backoff_factor=2,
@@ -433,15 +354,16 @@
# with Wayback's APIs, but urllib3 logs a warning on every retry:
# https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737

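For reference, the `Retry`-based setup the NOTE above sketches would look roughly like this (a hypothetical illustration, not part of this diff; the status codes are placeholders):

```python
# Sketch of the urllib3 Retry approach described in the NOTE above.
# Hypothetical: not part of this PR; the forcelist values are illustrative.
import requests
from urllib3.util import Retry

session = requests.Session()
adapter = requests.adapters.HTTPAdapter(
    max_retries=Retry(total=5, backoff_factor=2,
                      status_forcelist=(503, 504)))
session.mount('https://', adapter)
session.mount('http://', adapter)
```

As the comment notes, this works well with Wayback's APIs, but urllib3 logs a warning on every retry, which is why the session implements retries itself instead.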
-# Customize the built-in `send()` with retryability and rate limiting.
-def send(self, request: requests.PreparedRequest, **kwargs):
+def request(self, method, url, timeout=-1, **kwargs) -> WaybackHttpResponse:
+super().request()
Member Author comment:

This super() call is really just to keep _utils.DisableAfterCloseSession working, and I think the reality here is that we should probably just get rid of that utility. It provides some nice guardrails, but they don’t offer a huge amount of value.
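For context, a minimal sketch of the kind of guardrail such a utility might provide (an assumption; the real `_utils.DisableAfterCloseSession` may differ):

```python
# Hypothetical sketch of a DisableAfterCloseSession-style guardrail.
# Assumption: the actual _utils.DisableAfterCloseSession may differ.
class DisableAfterCloseSession:
    _closed = False

    def close(self, disable=True):
        # `disable=False` (as the removed reset() used) would release
        # connections without permanently disabling the session.
        if disable:
            self._closed = True

    def request(self, *args, **kwargs):
        if self._closed:
            raise RuntimeError('This session has already been closed.')
```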

start_time = time.time()
+timeout = self.timeout if timeout == -1 else timeout
maximum = self.retries
retries = 0

-url = urlparse(request.url)
+parsed_url = urlparse(url)
for path, limit in self.rate_limts.items():
-if url.path.startswith(path):
+if parsed_url.path.startswith(path):
rate_limit = limit
break
else:
@@ -450,23 +372,23 @@
while True:
retry_delay = 0
try:
-logger.debug('sending HTTP request %s "%s", %s', request.method, request.url, kwargs)
+logger.debug('sending HTTP request %s "%s", %s', method, url, kwargs)
rate_limit.wait()
-response = super().send(request, **kwargs)
+response = self.adapter.request(method, url, timeout=timeout, **kwargs)
retry_delay = self.get_retry_delay(retries, response)

if retries >= maximum or not self.should_retry(response):
if response.status_code == 429:
-read_and_close(response)
+response.close()
raise RateLimitError(response, retry_delay)
return response
else:
logger.debug('Received error response (status: %s), will retry', response.status_code)
-read_and_close(response)
+response.close(cache=False)
except WaybackSession.handleable_errors as error:
response = getattr(error, 'response', None)
if response is not None:
-read_and_close(response)
+response.close()

if retries >= maximum:
raise WaybackRetryError(retries, time.time() - start_time, error) from error
@@ -480,23 +402,6 @@ def send(self, request: requests.PreparedRequest, **kwargs):
time.sleep(retry_delay)
retries += 1

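To make the new control flow concrete, a hypothetical call against the refactored session (names taken from this diff; the URL and argument values are illustrative):

```python
# Hypothetical usage of the refactored WaybackSession in this diff.
session = WaybackSession(retries=3, timeout=30)
response = session.request(
    'GET', 'https://web.archive.org/web/2019/https://example.com/')
print(response.status_code)
response.close()  # release the connection, as the retry loop above does
```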
-# Customize `request` in order to set a default timeout from the session.
-# We can't do this in `send` because `request` always passes a `timeout`
-# keyword to `send`. Inside `send`, we can't tell the difference between a
-# user explicitly requesting no timeout and not setting one at all.
-def request(self, method, url, **kwargs):
-"""
-Perform an HTTP request using this session. For arguments and return
-values, see:
-https://requests.readthedocs.io/en/latest/api/#requests.Session.request
-
-If the ``timeout`` keyword argument is not set, it will default to the
-session's ``timeout`` attribute.
-"""
-if 'timeout' not in kwargs:
-kwargs['timeout'] = self.timeout
-return super().request(method, url, **kwargs)

def should_retry(self, response):
# A memento may actually be a capture of an error, so don't retry it :P
if is_memento_response(response):
@@ -539,18 +444,12 @@ def get_retry_delay(self, retries, response=None):

def reset(self):
"Reset any network connections the session is using."
-# Close really just closes all the adapters in `self.adapters`. We
-# could do that directly, but `self.adapters` is not documented/public,
-# so might be somewhat risky.
-self.close(disable=False)
-# Re-build the standard adapters. See:
-# https://github.com/kennethreitz/requests/blob/v2.22.0/requests/sessions.py#L415-L418
-self.mount('https://', requests.adapters.HTTPAdapter())
-self.mount('http://', requests.adapters.HTTPAdapter())
Member Author comment:

IIRC, I added this to help with some problems we were having managing connections in big jobs at EDGI; those were later solved with better fixes. It's kind of a weird method, and its presence is even weirder with the refactoring here. I think I should just drop it entirely.

+self.adapter.close()


# TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that
-# create a custom instance of urllib3.util.Retry
+# create a custom instance of urllib3.util.Retry?
class WaybackClient(_utils.DepthCountedContext):
"""
A client for retrieving data from the Internet Archive's Wayback Machine.
@@ -564,6 +463,8 @@ class WaybackClient(_utils.DepthCountedContext):
----------
session : WaybackSession, optional
"""
+session: WaybackSession
+
def __init__(self, session=None):
self.session = session or WaybackSession()

@@ -785,22 +686,21 @@ def search(self, url, *, match_type=None, limit=1000, offset=None,
sent_query, next_query = next_query, None
response = self.session.request('GET', CDX_SEARCH_URL,
params=sent_query)
-try:
-# Read/cache the response and close straightaway. If we need
-# to raise for status, we want to pre-emptively close the
-# response so a user handling the error doesn't need to
-# worry about it. If we don't raise here, we still want to
-# close the connection so it doesn't leak when we move on to
-# the next page of results or when this iterator ends.
-read_and_close(response)
-response.raise_for_status()
-except requests.exceptions.HTTPError as error:
+
+# Read/cache the response and close straightaway. If we need
+# to raise for status, we want to pre-emptively close the
+# response so a user handling the error doesn't need to
+# worry about it. If we don't raise here, we still want to
+# close the connection so it doesn't leak when we move on to
+# the next page of results or when this iterator ends.
+response.close()
+if response.status_code >= 400:
if 'AdministrativeAccessControlException' in response.text:
raise BlockedSiteError(query['url'])
elif 'RobotAccessControlException' in response.text:
raise BlockedByRobotsError(query['url'])
else:
-raise WaybackException(str(error))
+raise WaybackException(f'HTTP {response.status_code} error for CDX search: "{query}"')

lines = iter(response.content.splitlines())

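For orientation, a hypothetical use of the search API whose error handling changed above (field names follow the `CdxRecord` model imported at the top of the file):

```python
# Hypothetical usage of WaybackClient.search (signature from this diff).
with WaybackClient() as client:
    for record in client.search('example.com', limit=10):
        print(record.timestamp, record.url)
```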
@@ -1018,12 +918,11 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
current_url, current_date, current_mode = _utils.memento_url_data(response.url)

# In view mode, redirects need special handling.
-if current_mode == Mode.view.value:
+redirect_url = response.redirect_url
+if current_mode == Mode.view.value and not redirect_url:
redirect_url = detect_view_mode_redirect(response, current_date)
if redirect_url:
# Fix up response properties to be like other modes.
-redirect = requests.Request('GET', redirect_url)
-response._next = self.session.prepare_request(redirect)
response.headers['Memento-Datetime'] = current_date.strftime(
'%a, %d %b %Y %H:%M:%S %Z'
)
@@ -1062,11 +961,11 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
# rarely have been captured at the same time as the
# redirect itself. (See 2b)
playable = False
-if response.next and (
+if redirect_url and (
(len(history) == 0 and not exact) or
(len(history) > 0 and (previous_was_memento or not exact_redirects))
):
-target_url, target_date, _ = _utils.memento_url_data(response.next.url)
+target_url, target_date, _ = _utils.memento_url_data(redirect_url)
# A non-memento redirect is generally taking us to the
# closest-in-time capture of the same URL. Note that this is
# NOT the next capture -- i.e. the one that would have
@@ -1095,7 +994,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
playable = True

if not playable:
-read_and_close(response)
+response.close()
message = response.headers.get('X-Archive-Wayback-Runtime-Error', '')
if (
('AdministrativeAccessControlException' in message) or
@@ -1109,7 +1008,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
raise BlockedByRobotsError(f'{url} is blocked by robots.txt')
elif message:
raise MementoPlaybackError(f'Memento at {url} could not be played: {message}')
-elif response.ok:
+elif response.is_success:
# TODO: Raise more specific errors for the possible
# cases here. We *should* only arrive here when
# there's a redirect and:
@@ -1124,21 +1023,21 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *,
raise MementoPlaybackError(f'{response.status_code} error while loading '
f'memento at {url}')

-if response.next:
+if redirect_url:
previous_was_memento = is_memento
-read_and_close(response)
+response.close()

# Wayback sometimes has circular memento redirects ¯\_(ツ)_/¯
urls.add(response.url)
-if response.next.url in urls:
+if redirect_url in urls:
raise MementoPlaybackError(f'Memento at {url} is circular')

# All requests are included in `debug_history`, but
# `history` only shows redirects that were mementos.
debug_history.append(response.url)
if is_memento:
history.append(memento)
-response = self.session.send(response.next, allow_redirects=False)
+response = self.session.request('GET', redirect_url, allow_redirects=False)
else:
break

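Finally, a hypothetical call that exercises the redirect-following loop above (parameters from this diff; the timestamp is taken from the example URL in the removed hack's comment):

```python
# Hypothetical usage of WaybackClient.get_memento (signature from this diff).
with WaybackClient() as client:
    memento = client.get_memento('https://example.com/',
                                 timestamp='20181023233237')
    print(memento.url, memento.headers.get('Memento-Datetime'))
```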