Skip to content

Commit

Permalink
Fix bad timestamps (#85)
Browse files Browse the repository at this point in the history
Handle bad timestamps that have `00` for the day in CDX records.
  • Loading branch information
8W9aG committed Nov 17, 2021
1 parent b406f3d commit 02099ca
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 1 deletion.
8 changes: 7 additions & 1 deletion wayback/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,14 @@ def parse_timestamp(time_string):
def parse_timestamp(time_string):
    """
    Given a Wayback-style timestamp string, return an equivalent ``datetime``.

    CDX records occasionally carry a timestamp whose day field is ``00``
    (for example ``20000800241623``), which ``strptime`` cannot parse. Such
    a value is repaired before parsing: the ``00`` at positions 6-7 is
    removed, so the digits that followed it shift left and are read as the
    day, and ``00`` is appended at the end to restore the original length.

    ``time_string`` is a string in ``URL_DATE_FORMAT``. The result is a
    timezone-aware ``datetime`` in UTC. ``ValueError`` is raised if the
    (possibly repaired) string still does not match ``URL_DATE_FORMAT``.
    """
    # Fix bad timestamps: if the timestamp has a day of "00", drop those two
    # characters and pad the end with "00". Only the repaired string is
    # parsed -- parsing the raw value first would raise on exactly the
    # records this repair exists for.
    if time_string[6:8] == '00':
        time_string = time_string[:6] + time_string[8:] + '00'
    return (datetime
            .strptime(time_string, URL_DATE_FORMAT)
            .replace(tzinfo=timezone.utc))


Expand Down
26 changes: 26 additions & 0 deletions wayback/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,32 @@ def test_search_handles_no_length_cdx_records(requests_mock):
assert record_list[-1].length is None


def test_search_handles_bad_timestamp_cdx_records(requests_mock):
    """
    The CDX index can contain a timestamp with an invalid day "00", which
    can't be parsed into a timestamp. We should handle this.

    Because these are rare and hard to get all in a single CDX query that
    isn't *huge*, we use a made-up mock for this one instead of a VCR
    recording.
    """
    # Hand-written CDX response whose last record has the "00"-day timestamp.
    fixture_path = Path(__file__).parent / 'test_files' / 'bad_timestamp_cdx.txt'
    with open(fixture_path) as f:
        bad_cdx_data = f.read()

    with WaybackClient() as client:
        requests_mock.get('http://web.archive.org/cdx/search/cdx'
                          '?url=www.usatoday.com%2F%2A'
                          '&matchType=domain&filter=statuscode%3A200'
                          '&showResumeKey=true&resolveRevisits=true',
                          [{'status_code': 200, 'text': bad_cdx_data}])
        records = client.search('www.usatoday.com/*',
                                matchType="domain",
                                filter_field="statuscode:200")

        # NOTE(review): results are consumed while the client is still open,
        # on the assumption that search() yields lazily -- confirm.
        record_list = list(records)
        # Every record must survive, including the one with the bad timestamp.
        assert len(record_list) == 5
        # "20000800241623" is expected to be repaired so that the day is 24.
        assert record_list[-1].timestamp.day == 24


@ia_vcr.use_cassette()
def test_get_memento():
with WaybackClient() as client:
Expand Down
5 changes: 5 additions & 0 deletions wayback/tests/test_files/bad_timestamp_cdx.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
com,usatoday)/2000/century/tech/003d.htm 20011120210446 http://www.usatoday.com:80/2000/century/tech/003d.htm text/html 200 EJTUZEVOPPFGLXXQK2KV4DPFRSOULYVN 3823
com,usatoday)/2000/century/tech/004.htm 20000125210430 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 EBWZW6DNCJK2PU2DYX2JX2SWD6NQMUXK 6822
com,usatoday)/2000/century/tech/004.htm 20000311052312 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 BTVE5SD57GD4HZHWISTWPLXRH7XONXW6 6214
com,usatoday)/2000/century/tech/004.htm 20000613174049 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177

0 comments on commit 02099ca

Please sign in to comment.