From c5d816f86eb3707d72a8ecf5f3823e0daa1b3808 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 Nov 2024 13:18:24 +0100 Subject: [PATCH 1/3] Scan for JS code also in CSS comments The `Cleaner()` now scans for hidden JavaScript code embedded within CSS comments. In certain contexts, such as within `` or `` tags, `' - return True + + for with_comments in True, False: + if not with_comments: + style = self._substitute_comments('', style) + + style = style.replace('\\', '') + + if _has_javascript_scheme(style): + return True + if 'expression(' in style: + return True + if '@import' in style: + return True + if '' + return True return False def clean_html(self, html): diff --git a/tests/test_clean.py b/tests/test_clean.py index 8c9bc20..5e844b9 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -127,6 +127,23 @@ def test_sneaky_js_in_math_style(self): b'', lxml.html.tostring(clean_html(s))) + def test_sneaky_js_in_style_comment_math_svg(self): + for tag in "svg", "math": + html = f'<{tag}>'.encode(), + lxml.html.tostring(clean_html(s))) + + def test_sneaky_js_in_style_comment_noscript(self): + html = '', + lxml.html.tostring(clean_html(s))) + def test_sneaky_import_in_style(self): # Prevent "@@importimport" -> "@import" replacement etc. style_codes = [ From d4759552942e88ba9cfcb2774ec49ada86f760d9 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 Nov 2024 13:20:12 +0100 Subject: [PATCH 2/3] Release 0.4.0 --- CHANGES.rst | 11 +++++++++++ setup.cfg | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8c4e986..7433327 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,6 +6,17 @@ lxml_html_clean changelog Unreleased ========== +0.4.0 (2024-11-12) +================== + +Bugs fixed +---------- + +* The ``Cleaner()`` now scans for hidden JavaScript code embedded + within CSS comments. In certain contexts, such as within ```` or ```` tags, + ``' - return True + if _has_javascript_scheme(style): + return True + if 'expression(' in style: + return True + if '@import' in style: + return True + if '' + return True return False + def _remove_sneaky_css_comments(self, style): + """ + Look for suspicious code in CSS comment and if found, + remove the entire comment from the given style. + + Browsers might parse ' s = lxml.html.fragment_fromstring(html) + expected = f'<{tag}>'.encode() + self.assertEqual( - f'<{tag}>'.encode(), + expected, lxml.html.tostring(clean_html(s))) def test_sneaky_js_in_style_comment_noscript(self): - html = '' s = lxml.html.fragment_fromstring(html) self.assertEqual( - b'', + b'', lxml.html.tostring(clean_html(s))) def test_sneaky_import_in_style(self):