Skip to content

Commit

Permalink
Fix website loader content encoding detection (#482)
Browse files (browse the repository at this point in the history)
  • Loading branch information
sissbruecker authored May 30, 2023
1 parent 5d48c64 commit 4220ea0
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 2 deletions.
4 changes: 3 additions & 1 deletion bookmarks/services/website_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ def load_page(url: str):
logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')

# Stop reading if we have parsed end of head tag
if '</head>'.encode('utf-8') in content:
end_of_head = '</head>'.encode('utf-8')
if end_of_head in content:
logger.debug(f'Found closing head tag after {size} bytes')
content = content.split(end_of_head)[0] + end_of_head
break
# Stop reading if we exceed limit
if size > MAX_CONTENT_LIMIT:
Expand Down
14 changes: 13 additions & 1 deletion bookmarks/tests/test_website_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_load_page_limits_large_documents(self):
expected_content_size = 6 * 1024 * 1000
self.assertEqual(expected_content_size, len(content))

def test_load_page_stops_reading_at_closing_head_tag(self):
def test_load_page_stops_reading_at_end_of_head(self):
with mock.patch('requests.get') as mock_get:
mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000,
insert_head_after_chunk=0)
Expand All @@ -69,6 +69,18 @@ def test_load_page_stops_reading_at_closing_head_tag(self):
expected_content_size = 1 * 1024 * 1000 + len('</head>')
self.assertEqual(expected_content_size, len(content))

def test_load_page_removes_bytes_after_end_of_head(self):
with mock.patch('requests.get') as mock_get:
mock_response = MockStreamingResponse(num_chunks=1, chunk_size=0)
mock_response.chunks[0] = '<head>人</head>'.encode('utf-8')
# add a single byte that can't be decoded to utf-8
mock_response.chunks[0] += 0xff.to_bytes(1, 'big')
mock_get.return_value = mock_response
content = website_loader.load_page('https://example.com')

# verify that byte after head was removed, content parsed as utf-8
self.assertEqual(content, '<head>人</head>')

def test_load_website_metadata(self):
with mock.patch('bookmarks.services.website_loader.load_page') as mock_load_page:
mock_load_page.return_value = self.render_html_document('test title', 'test description')
Expand Down

0 comments on commit 4220ea0

Please sign in to comment.