Skip to content

Commit 3b644e9

Browse files
committed
Scan for JS code also in CSS comments
`Cleaner()` now also scans for hidden JavaScript code embedded within CSS comments. In certain contexts, such as inside `<svg>` or `<math>` elements, a `<style>` tag may lose its intended function, so the contents of a comment like `/* foo */` can be parsed as markup and potentially executed by the browser.
1 parent dcbc163 commit 3b644e9

File tree

2 files changed

+36
-14
lines changed

2 files changed

+36
-14
lines changed

Diff for: lxml_html_clean/clean.py

+19-14
Original file line numberDiff line numberDiff line change
@@ -581,22 +581,27 @@ def _has_sneaky_javascript(self, style):
581581
that and remove only the Javascript from the style; this catches
582582
more sneaky attempts.
583583
"""
584-
style = self._substitute_comments('', style)
585-
style = style.replace('\\', '')
586584
style = _substitute_whitespace('', style)
587585
style = style.lower()
588-
if _has_javascript_scheme(style):
589-
return True
590-
if 'expression(' in style:
591-
return True
592-
if '@import' in style:
593-
return True
594-
if '</noscript' in style:
595-
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
596-
return True
597-
if _looks_like_tag_content(style):
598-
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
599-
return True
586+
587+
for with_comments in True, False:
588+
if not with_comments:
589+
style = self._substitute_comments('', style)
590+
591+
style = style.replace('\\', '')
592+
593+
if _has_javascript_scheme(style):
594+
return True
595+
if 'expression(' in style:
596+
return True
597+
if '@import' in style:
598+
return True
599+
if '</noscript' in style:
600+
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
601+
return True
602+
if _looks_like_tag_content(style):
603+
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
604+
return True
600605
return False
601606

602607
def clean_html(self, html):

Diff for: tests/test_clean.py

+17
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,23 @@ def test_sneaky_js_in_math_style(self):
127127
b'<math><style>/* deleted */</style></math>',
128128
lxml.html.tostring(clean_html(s)))
129129

130+
def test_sneaky_js_in_style_comment_math_svg(self):
131+
for tag in "svg", "math":
132+
html = f'<{tag}><style>/*<img src onerror=alert(origin)>*/'
133+
s = lxml.html.fragment_fromstring(html)
134+
135+
self.assertEqual(
136+
f'<{tag}><style>/* deleted */</style></{tag}>'.encode(),
137+
lxml.html.tostring(clean_html(s)))
138+
139+
def test_sneaky_js_in_style_comment_noscript(self):
140+
html = '<noscript><style>/*</noscript><img src onerror=alert(origin)>*/'
141+
s = lxml.html.fragment_fromstring(html)
142+
143+
self.assertEqual(
144+
b'<noscript><style>/* deleted */</style></noscript>',
145+
lxml.html.tostring(clean_html(s)))
146+
130147
def test_sneaky_import_in_style(self):
131148
# Prevent "@@importimport" -> "@import" replacement etc.
132149
style_codes = [

0 commit comments

Comments
 (0)