# Attempt different character encodings when previewing a URL. #11077
The PR adds a one-line changelog entry:

```diff
@@ -0,0 +1 @@
+Fix a long-standing bug when attempting to preview URLs which are in the `windows-1252` character encoding.
```
The code change reworks the URL preview path. First, the call site in `_do_preview`:

```diff
@@ -295,8 +295,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
         with open(media_info.filename, "rb") as file:
             body = file.read()
 
-        encoding = get_html_media_encoding(body, media_info.media_type)
-        tree = decode_body(body, encoding)
+        tree = decode_body(body, media_info.uri, media_info.media_type)
         if tree is not None:
             # Check if this HTML document points to oEmbed information and
             # defer to that.
```
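With this change the caller no longer guesses an encoding up front; `decode_body` receives the URI (used for logging) and the Content-Type header and performs encoding detection itself. A hypothetical invocation (values invented for illustration):

```python
# Hypothetical values; in the PR, media_info supplies uri and media_type.
tree = decode_body(body, "https://example.com/page", "text/html; charset=windows-1252")
```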
The core change turns `get_html_media_encoding` into a generator, `get_html_media_encodings`, which yields candidate encodings in order of precedence:

```diff
@@ -632,16 +631,19 @@ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
         logger.debug("No media removed from url cache")
 
 
-def get_html_media_encoding(body: bytes, content_type: str) -> str:
+def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]:
     """
-    Get the encoding of the body based on the (presumably) HTML body or media_type.
+    Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
 
     The precedence used for finding a character encoding is:
 
-    1. meta tag with a charset declared.
+    1. <meta> tag with a charset declared.
     2. The XML document's character encoding attribute.
     3. The Content-Type header.
-    4. Fallback to UTF-8.
+    4. Fallback to utf-8.
+    5. Fallback to windows-1252.
 
     This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.
 
     Args:
         body: The HTML document, as bytes.
```
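The next hunk matches the body against three module-level regular expressions whose definitions sit outside this diff. A rough sketch of what such patterns could look like (illustrative assumptions only, not the definitions actually used in the file):

```python
import re

# Illustrative sketches; the real _charset_match, _xml_encoding_match and
# _content_type_match are defined elsewhere in the module and may differ.
# The first two operate on the raw bytes of the body, the third on the
# Content-Type header string.
_charset_match = re.compile(rb'<\s*meta[^>]*charset\s*=\s*"?([a-zA-Z0-9_-]+)"?', re.I)
_xml_encoding_match = re.compile(rb'\s*<\s*\?\s*xml[^>]*encoding="([a-zA-Z0-9_-]+)"', re.I)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', re.I)
```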
```diff
@@ -653,36 +655,39 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     # Limit searches to the first 1kb, since it ought to be at the top.
     body_start = body[:1024]
 
-    # Let's try and figure out if it has an encoding set in a meta tag.
+    # Check if it has an encoding set in a meta tag.
     match = _charset_match.search(body_start)
     if match:
-        return match.group(1).decode("ascii")
+        yield match.group(1).decode("ascii")
 
     # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 
-    # If we didn't find a match, see if it an XML document with an encoding.
+    # Check if it has an XML document with an encoding.
     match = _xml_encoding_match.match(body_start)
     if match:
-        return match.group(1).decode("ascii")
+        yield match.group(1).decode("ascii")
 
-    # If we don't find a match, we'll look at the HTTP Content-Type, and
-    # if that doesn't exist, we'll fall back to UTF-8.
-    content_match = _content_type_match.match(content_type)
-    if content_match:
-        return content_match.group(1)
+    # Check the HTTP Content-Type header for a character set.
+    if content_type:
+        content_match = _content_type_match.match(content_type)
+        if content_match:
+            yield content_match.group(1)
 
-    return "utf-8"
+    # Finally, fallback to UTF-8, then windows-1252.
+    yield "utf-8"
+    yield "windows-1252"
```
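As a concrete illustration of that precedence (a hypothetical call; the exact values yielded depend on the module's real regexes):

```python
body = b'<html><head><meta charset="ascii"></head><body>hi</body></html>'
candidates = list(get_html_media_encodings(body, "text/html; charset=utf-16"))
# The <meta> charset comes first, then the Content-Type header's charset,
# then the two hard-coded fallbacks:
# candidates == ["ascii", "utf-16", "utf-8", "windows-1252"]
```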
`decode_body` now takes the URI and Content-Type header instead of a pre-computed encoding:

```diff
 def decode_body(
-    body: bytes, request_encoding: Optional[str] = None
+    body: bytes, uri: str, content_type: Optional[str] = None
 ) -> Optional["etree.Element"]:
     """
     This uses lxml to parse the HTML document.
 
     Args:
         body: The HTML document, as bytes.
-        request_encoding: The character encoding of the body, as a string.
+        uri: The URI used to download the body.
+        content_type: The Content-Type header.
 
     Returns:
         The parsed HTML body, or None if an error occurred during processed.
```
Its body now loops over the candidate encodings until one decodes successfully:

```diff
@@ -691,32 +696,25 @@ def decode_body(
     if not body:
         return None
 
+    for encoding in get_html_media_encodings(body, content_type):
+        try:
+            body_str = body.decode(encoding)
+        except Exception:
+            pass
+        else:
+            break
+    else:
+        logger.warning("Unable to decode HTML body for %s", uri)
+        return None
```
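The added loop relies on Python's for/else: the `else` branch runs only when the loop finishes without hitting `break`, i.e. when every candidate encoding failed. A minimal standalone illustration of the pattern:

```python
# for/else: the trailing "else" runs only if the loop never hit "break".
text = None
for candidate in ("utf-16", "utf-8"):
    try:
        # utf-16 fails on this odd-length input; utf-8 yields a check mark.
        text = b"\xe2\x9c\x93".decode(candidate)
    except UnicodeDecodeError:
        pass  # try the next candidate
    else:
        break  # decoded successfully, so the trailing "else" is skipped
else:
    text = None  # every candidate failed
```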
An inline review comment on this hunk:

> **Reviewer:** Just a thought: would there be value in falling back to latin-1 (iso-8859-1) after trying windows-1252? The advantage would be that all bytes decode in latin-1, so you would always get a resulting string. The drawback is that if the content wasn't really encoded with latin-1, then you're just returning noise. (However, the same issue applies to using windows-1252 as a fallback; it's just that windows-1252 has five invalid bytes on which decoding could fail, whereas latin-1 has zero invalid bytes.)
>
> **Author:** That could help. I think worst case you'd get some garbage characters, but hopefully some of the preview would still be useful.
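A hedged sketch of that suggestion (not part of this PR; the latin-1 fallback and the helper name are purely illustrative):

```python
def decode_with_final_fallback(body: bytes) -> str:
    # latin-1 maps every byte value 0x00-0xff to a code point, so the
    # final attempt can never raise; the result may be mojibake if the
    # true encoding was something else.
    for encoding in ("utf-8", "windows-1252", "iso-8859-1"):
        try:
            return body.decode(encoding)
        except UnicodeDecodeError:
            continue
    raise AssertionError("unreachable: latin-1 accepts all byte values")
```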
The remainder of the hunk simplifies the lxml parsing, since the body has already been decoded to a string:

```diff
     from lxml import etree
 
-    # Create an HTML parser. If this fails, log and return no metadata.
-    try:
-        parser = etree.HTMLParser(recover=True, encoding=request_encoding)
-    except LookupError:
-        # blindly consider the encoding as utf-8.
-        parser = etree.HTMLParser(recover=True, encoding="utf-8")
-    except Exception as e:
-        logger.warning("Unable to create HTML parser: %s" % (e,))
-        return None
+    # Create an HTML parser.
+    parser = etree.HTMLParser(recover=True, encoding="utf-8")
 
-    def _attempt_decode_body(
-        body_attempt: Union[bytes, str]
-    ) -> Optional["etree.Element"]:
-        # Attempt to parse the body. Returns None if the body was successfully
-        # parsed, but no tree was found.
-        return etree.fromstring(body_attempt, parser)
-
-    # Attempt to parse the body. If this fails, log and return no metadata.
-    try:
-        return _attempt_decode_body(body)
-    except UnicodeDecodeError:
-        # blindly try decoding the body as utf-8, which seems to fix
-        # the charset mismatches on https://google.com
-        return _attempt_decode_body(body.decode("utf-8", "ignore"))
+    # Attempt to parse the body. Returns None if the body was successfully
+    # parsed, but no tree was found.
+    return etree.fromstring(body_str, parser)
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
```
Further review discussion on `get_html_media_encodings`:

> **Reviewer:** Note: one easy thing to do to mitigate @squahtx's concern would be to use a set to track which encodings this code has already yielded, and then skip yielding that encoding a second time.
>
> **Author:** The BeautifulSoup code does this and... I meant to implement it and failed to remember to do that. I'll do a follow-up! Thanks!
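A minimal sketch of that deduplication, using a hypothetical wrapper around the candidate generator (the actual follow-up landed in #11089):

```python
from typing import Iterable, Iterator, Set

def deduplicate_encodings(candidates: Iterable[str]) -> Iterator[str]:
    # Hypothetical helper: yield each candidate encoding at most once,
    # normalising case so "UTF-8" and "utf-8" count as duplicates.
    attempted: Set[str] = set()
    for encoding in candidates:
        normalised = encoding.lower()
        if normalised not in attempted:
            attempted.add(normalised)
            yield normalised
```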
> **Reviewer:** Oops, I didn't notice the set at all. Some of my comments might not have made sense, since I thought there wasn't one.
>
> **Author:** See #11089.