From 61781926db8fbda9aabbe73565a3160e8b84ba32 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 13 Oct 2021 14:52:42 -0400 Subject: [PATCH 1/3] Attempt different character encodings when previewing a URL. --- synapse/rest/media/v1/preview_url_resource.py | 76 +++++++++---------- tests/test_preview.py | 44 +++++++---- 2 files changed, 64 insertions(+), 56 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 5bddd21ef13c..c1acfb0c6ed7 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -295,8 +295,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes: with open(media_info.filename, "rb") as file: body = file.read() - encoding = get_html_media_encoding(body, media_info.media_type) - tree = decode_body(body, encoding) + tree = decode_body(body, media_info.media_type) if tree is not None: # Check if this HTML document points to oEmbed information and # defer to that. @@ -632,16 +631,19 @@ def try_remove_parent_dirs(dirs: Iterable[str]) -> None: logger.debug("No media removed from url cache") -def get_html_media_encoding(body: bytes, content_type: str) -> str: +def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]: """ - Get the encoding of the body based on the (presumably) HTML body or media_type. + Get potential encoding of the body based on the (presumably) HTML body or the content-type header. The precedence used for finding a character encoding is: - 1. meta tag with a charset declared. + 1. tag with a charset declared. 2. The XML document's character encoding attribute. 3. The Content-Type header. - 4. Fallback to UTF-8. + 4. Fallback to utf-8. + 5. Fallback to windows-1252. + + This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. Args: body: The HTML document, as bytes. @@ -653,36 +655,38 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str: # Limit searches to the first 1kb, since it ought to be at the top. body_start = body[:1024] - # Let's try and figure out if it has an encoding set in a meta tag. + # Check if it has an encoding set in a meta tag. match = _charset_match.search(body_start) if match: - return match.group(1).decode("ascii") + yield match.group(1).decode("ascii") # TODO Support - # If we didn't find a match, see if it an XML document with an encoding. + # Check if it has an XML document with an encoding. match = _xml_encoding_match.match(body_start) if match: - return match.group(1).decode("ascii") + yield match.group(1).decode("ascii") - # If we don't find a match, we'll look at the HTTP Content-Type, and - # if that doesn't exist, we'll fall back to UTF-8. - content_match = _content_type_match.match(content_type) - if content_match: - return content_match.group(1) + # Check the HTTP Content-Type header for a character set. + if content_type: + content_match = _content_type_match.match(content_type) + if content_match: + yield content_match.group(1) - return "utf-8" + # Finally, fallback to UTF-8, then windows-1252. + yield "utf-8" + yield "windows-1252" def decode_body( - body: bytes, request_encoding: Optional[str] = None + body: bytes, content_type: Optional[str] = None ) -> Optional["etree.Element"]: """ This uses lxml to parse the HTML document. Args: body: The HTML document, as bytes. - request_encoding: The character encoding of the body, as a string. + content_type: The Content-Type header. Returns: The parsed HTML body, or None if an error occurred during processed. @@ -691,32 +695,22 @@ def decode_body( if not body: return None + for encoding in get_html_media_encodings(body, content_type): + try: + body_str = body.decode(encoding) + except Exception as e: + pass + else: + break + from lxml import etree - # Create an HTML parser. If this fails, log and return no metadata. - try: - parser = etree.HTMLParser(recover=True, encoding=request_encoding) - except LookupError: - # blindly consider the encoding as utf-8. - parser = etree.HTMLParser(recover=True, encoding="utf-8") - except Exception as e: - logger.warning("Unable to create HTML parser: %s" % (e,)) - return None + # Create an HTML parser. + parser = etree.HTMLParser(recover=True, encoding="utf-8") - def _attempt_decode_body( - body_attempt: Union[bytes, str] - ) -> Optional["etree.Element"]: - # Attempt to parse the body. Returns None if the body was successfully - # parsed, but no tree was found. - return etree.fromstring(body_attempt, parser) - - # Attempt to parse the body. If this fails, log and return no metadata. - try: - return _attempt_decode_body(body) - except UnicodeDecodeError: - # blindly try decoding the body as utf-8, which seems to fix - # the charset mismatches on https://google.com - return _attempt_decode_body(body.decode("utf-8", "ignore")) + # Attempt to parse the body. Returns None if the body was successfully + # parsed, but no tree was found. + return etree.fromstring(body_str, parser) def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: diff --git a/tests/test_preview.py b/tests/test_preview.py index 09e017b4d94c..af6cf6ae4740 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -15,7 +15,7 @@ from synapse.rest.media.v1.preview_url_resource import ( _calc_og, decode_body, - get_html_media_encoding, + get_html_media_encodings, summarize_paragraphs, ) @@ -306,11 +306,25 @@ def test_invalid_encoding2(self): og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + def test_windows_1252(self): + """A body which uses windows-1252, but doesn't declare that.""" + html = b""" + + \xf3 + + Some text. + + + """ + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") + self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) + class MediaEncodingTestCase(unittest.TestCase): def test_meta_charset(self): """A character encoding is found via the meta tag.""" - encoding = get_html_media_encoding( + encodings = get_html_media_encodings( b""" @@ -319,10 +333,10 @@ def test_meta_charset(self): """, "text/html", ) - self.assertEqual(encoding, "ascii") + self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) # A less well-formed version. - encoding = get_html_media_encoding( + encodings = get_html_media_encodings( b""" < meta charset = ascii> @@ -331,11 +345,11 @@ def test_meta_charset(self): """, "text/html", ) - self.assertEqual(encoding, "ascii") + self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) def test_meta_charset_underscores(self): """A character encoding contains underscore.""" - encoding = get_html_media_encoding( + encodings = get_html_media_encodings( b""" @@ -344,11 +358,11 @@ def test_meta_charset_underscores(self): """, "text/html", ) - self.assertEqual(encoding, "Shift_JIS") + self.assertEqual(list(encodings), ["Shift_JIS", "utf-8", "windows-1252"]) def test_xml_encoding(self): """A character encoding is found via the meta tag.""" - encoding = get_html_media_encoding( + encodings = get_html_media_encodings( b""" @@ -356,11 +370,11 @@ def test_xml_encoding(self): """, "text/html", ) - self.assertEqual(encoding, "ascii") + self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) def test_meta_xml_encoding(self): """Meta tags take precedence over XML encoding.""" - encoding = get_html_media_encoding( + encodings = get_html_media_encodings( b""" @@ -370,7 +384,7 @@ def test_meta_xml_encoding(self): """, "text/html", ) - self.assertEqual(encoding, "UTF-16") + self.assertEqual(list(encodings), ["UTF-16", "ascii", "utf-8", "windows-1252"]) def test_content_type(self): """A character encoding is found via the Content-Type header.""" @@ -384,10 +398,10 @@ def test_content_type(self): 'text/html; charset=ascii";', ) for header in headers: - encoding = get_html_media_encoding(b"", header) - self.assertEqual(encoding, "ascii") + encodings = get_html_media_encodings(b"", header) + self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) def test_fallback(self): """A character encoding cannot be found in the body or header.""" - encoding = get_html_media_encoding(b"", "text/html") - self.assertEqual(encoding, "utf-8") + encodings = get_html_media_encodings(b"", "text/html") + self.assertEqual(list(encodings), ["utf-8", "windows-1252"]) From ca56c8c81e449749e3fff9e8b59466e548e894aa Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 13 Oct 2021 14:56:56 -0400 Subject: [PATCH 2/3] Newsfragment --- changelog.d/11077.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/11077.bugfix diff --git a/changelog.d/11077.bugfix b/changelog.d/11077.bugfix new file mode 100644 index 000000000000..dc35c86440e7 --- /dev/null +++ b/changelog.d/11077.bugfix @@ -0,0 +1 @@ +Fix a long-standing bug when attempting to preview URLs which are in the `windows-1252` character encoding. From eda74c86cc6eda9c828ebe97aea9ff37f68bf37d Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 13 Oct 2021 15:21:40 -0400 Subject: [PATCH 3/3] Log a warning if unable to decode the body. --- synapse/rest/media/v1/preview_url_resource.py | 10 +++++--- tests/test_preview.py | 24 +++++++++---------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index c1acfb0c6ed7..7ee91a0c0534 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -295,7 +295,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes: with open(media_info.filename, "rb") as file: body = file.read() - tree = decode_body(body, media_info.media_type) + tree = decode_body(body, media_info.uri, media_info.media_type) if tree is not None: # Check if this HTML document points to oEmbed information and # defer to that. @@ -679,13 +679,14 @@ def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterab def decode_body( - body: bytes, content_type: Optional[str] = None + body: bytes, uri: str, content_type: Optional[str] = None ) -> Optional["etree.Element"]: """ This uses lxml to parse the HTML document. Args: body: The HTML document, as bytes. + uri: The URI used to download the body. content_type: The Content-Type header. Returns: @@ -698,10 +699,13 @@ def decode_body( for encoding in get_html_media_encodings(body, content_type): try: body_str = body.decode(encoding) - except Exception as e: + except Exception: pass else: break + else: + logger.warning("Unable to decode HTML body for %s", uri) + return None from lxml import etree diff --git a/tests/test_preview.py b/tests/test_preview.py index af6cf6ae4740..c6789017bc7d 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -159,7 +159,7 @@ def test_simple(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -175,7 +175,7 @@ def test_comment(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -194,7 +194,7 @@ def test_comment2(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual( @@ -216,7 +216,7 @@ def test_script(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -230,7 +230,7 @@ def test_missing_title(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -245,7 +245,7 @@ def test_h1_as_title(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -260,7 +260,7 @@ def test_missing_title_and_broken_h1(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -268,13 +268,13 @@ def test_missing_title_and_broken_h1(self): def test_empty(self): """Test a body with no data in it.""" html = b"" - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") self.assertIsNone(tree) def test_no_tree(self): """A valid body with no tree in it.""" html = b"\x00" - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") self.assertIsNone(tree) def test_invalid_encoding(self): @@ -287,7 +287,7 @@ def test_invalid_encoding(self): """ - tree = decode_body(html, "invalid-encoding") + tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -302,7 +302,7 @@ def test_invalid_encoding2(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) @@ -316,7 +316,7 @@ def test_windows_1252(self): """ - tree = decode_body(html) + tree = decode_body(html, "http://example.com/test.html") og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})