Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bad timestamps #85

Merged
merged 2 commits into from
Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion wayback/_utils.py
Original file line number Diff line number Diff line change
def parse_timestamp(time_string):
    """
    Given a Wayback-style timestamp string, return an equivalent ``datetime``.

    A timestamp whose day field (characters 6-7) is "00" cannot be parsed
    directly. In that case the "00" is dropped, the remaining digits shift
    left into the day and time fields, and "00" is appended to the end so the
    string keeps its length. For example, "20000800241623" is parsed as if it
    were "20000824162300".

    The returned ``datetime`` is timezone-aware, with its timezone set to UTC.
    """
    # Fix bad timestamps: a day of "00" would make ``strptime`` raise
    # ``ValueError``, so splice it out and pad the end before parsing.
    if time_string[6:8] == '00':
        time_string = time_string[:6] + time_string[8:] + '00'
    return (datetime
            .strptime(time_string, URL_DATE_FORMAT)
            .replace(tzinfo=timezone.utc))


Expand Down
26 changes: 26 additions & 0 deletions wayback/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,32 @@ def test_search_handles_no_length_cdx_records(requests_mock):
assert record_list[-1].length is None


def test_search_handles_bad_timestamp_cdx_records(requests_mock):
    """
    CDX results can include a timestamp whose day is the invalid value "00",
    which can't be parsed into a timestamp as-is. Searching should cope with
    such records.

    These records are rare, and gathering them from a single real CDX query
    would require a *huge* response, so this test uses a made-up mock
    response instead of a VCR recording.
    """
    fixture_path = Path(__file__).parent / 'test_files' / 'bad_timestamp_cdx.txt'
    bad_cdx_data = fixture_path.read_text()
    mock_url = ('http://web.archive.org/cdx/search/cdx'
                '?url=www.usatoday.com%2F%2A'
                '&matchType=domain&filter=statuscode%3A200'
                '&showResumeKey=true&resolveRevisits=true')

    with WaybackClient() as client:
        requests_mock.get(mock_url,
                          [{'status_code': 200, 'text': bad_cdx_data}])
        search_results = client.search('www.usatoday.com/*',
                                       matchType='domain',
                                       filter_field='statuscode:200')

        # The fixture holds five rows; the last one carries the "00" day.
        record_list = list(search_results)
        assert len(record_list) == 5
        assert record_list[-1].timestamp.day == 24


@ia_vcr.use_cassette()
def test_get_memento():
with WaybackClient() as client:
Expand Down
5 changes: 5 additions & 0 deletions wayback/tests/test_files/bad_timestamp_cdx.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
com,usatoday)/2000/century/tech/003d.htm 20011120210446 http://www.usatoday.com:80/2000/century/tech/003d.htm text/html 200 EJTUZEVOPPFGLXXQK2KV4DPFRSOULYVN 3823
com,usatoday)/2000/century/tech/004.htm 20000125210430 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 EBWZW6DNCJK2PU2DYX2JX2SWD6NQMUXK 6822
com,usatoday)/2000/century/tech/004.htm 20000311052312 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 BTVE5SD57GD4HZHWISTWPLXRH7XONXW6 6214
com,usatoday)/2000/century/tech/004.htm 20000613174049 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177