Handle Location headers that are absolute paths (#60)

Most redirects in Wayback redirect to a complete URL, with headers like: `Location: http://web.archive.org/web/20201027215555id_/https://www.whitehouse.gov/administration/eop/ostp/about/student/faqs`. But some include only an absolute path (which is still valid), e.g.: `Location: /web/20201027215555id_/whitehouse.gov/ostp/about/student/faqs`. We weren't correctly handling the latter case, leading to exceptions while parsing headers. Fixes #59.
edgi-govdata-archiving · Nov 4, 2020 · c2ca979 · c2ca979
1 parent ee40997
commit c2ca979
Show file tree

Hide file tree

Showing 5 changed files with 506 additions and 4 deletions.
diff --git a/docs/source/release-history.rst b/docs/source/release-history.rst
@@ -2,6 +2,12 @@
 Release History
 ===============
 
+v0.3.0 Alpha 2 (2020-11-04)
+---------------------------
+
+Fixes a bug in the new :class:`wayback.Memento` type where header parsing would fail for mementos with path-based ``Location`` headers. (`#60 <https://github.com/edgi-govdata-archiving/wayback/pull/60>`_)
+
+
 v0.3.0 Alpha 1 (2020-10-20)
 ---------------------------
 

diff --git a/wayback/_client.py b/wayback/_client.py
@@ -721,7 +721,7 @@ def get_memento(self, url, datetime=None, mode=Mode.original, *,
                                       mode=current_mode,
                                       memento_url=response.url,
                                       status_code=response.status_code,
-                                      headers=Memento.parse_memento_headers(response.headers),
+                                      headers=Memento.parse_memento_headers(response.headers, response.url),
                                       encoding=response.encoding,
                                       raw=response,
                                       raw_headers=response.headers,

diff --git a/wayback/_models.py b/wayback/_models.py
@@ -1,4 +1,5 @@
 from collections import namedtuple
+from urllib.parse import urlparse
 from ._utils import memento_url_data
 
 
@@ -249,14 +250,17 @@ def __exit__(self, *_args):
         self.close()
 
     @classmethod
-    def parse_memento_headers(cls, raw_headers):
+    def parse_memento_headers(cls, raw_headers, url='http://web.archive.org/'):
         """
         Extract historical headers from the Memento HTTP response's headers.
 
         Parameters
         ----------
         raw_headers : dict
             A dict of HTTP headers from the Memento's HTTP response.
+        url : str, optional
+            The URL of the resource the headers are being parsed for. It's used
+            when header data contains relative/incomplete URL information.
 
         Returns
         -------
@@ -281,7 +285,14 @@ def parse_memento_headers(cls, raw_headers):
         # The `Location` header for a redirect does not have an X-Archive-Orig-
         # version, and the normal location header point to the next *Wayback*
         # URL, so we need to parse it to get the historical redirect URL.
-        if 'Location' in raw_headers:
-            headers['Location'], _, _ = memento_url_data(raw_headers['Location'])
+        if 'Location' not in headers and 'Location' in raw_headers:
+            raw_location = raw_headers['Location']
+            # Some Wayback redirects provide a complete URL with a scheme and
+            # host in the `Location` header, but others provide only a path.
+            if raw_location.startswith('/'):
+                base_data = urlparse(url)
+                raw_location = f'{base_data.scheme}://{base_data.netloc}{raw_location}'
+
+            headers['Location'], _, _ = memento_url_data(raw_location)
 
         return headers