Skip to content

Commit a074425

Browse files
committed
Remove only the CSS comment if a suspicious content is detected
1 parent 90bcfa8 commit a074425

File tree

3 files changed

+47
-26
lines changed

3 files changed

+47
-26
lines changed

Diff for: CHANGES.rst

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Bugs fixed
1616
within CSS comments. In certain contexts, such as within ``<svg>`` or ``<math>`` tags,
1717
``<style>`` tags may lose their intended function, allowing comments
1818
like ``/* foo */`` to potentially be executed by the browser.
19+
If a suspicious content is detected, only the comment is removed.
1920

2021
0.3.1 (2024-10-09)
2122
==================

Diff for: lxml_html_clean/clean.py

+40-22
Original file line numberDiff line numberDiff line change
@@ -366,8 +366,11 @@ def __call__(self, doc):
366366
new = _replace_css_import('', new)
367367
if self._has_sneaky_javascript(new):
368368
# Something tricky is going on...
369-
el.text = '/* deleted */'
370-
elif new != old:
369+
new = '/* deleted */'
370+
else:
371+
new = self._remove_sneaky_css_comments(new)
372+
373+
if new != old:
371374
el.text = new
372375
if self.comments:
373376
kill_tags.add(etree.Comment)
@@ -568,7 +571,9 @@ def _remove_javascript_link(self, link):
568571
return ''
569572
return link
570573

571-
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
574+
_comments_re = re.compile(r'/\*.*?\*/', re.S)
575+
_find_comments = _comments_re.finditer
576+
_substitute_comments = _comments_re.sub
572577

573578
def _has_sneaky_javascript(self, style):
574579
"""
@@ -581,29 +586,42 @@ def _has_sneaky_javascript(self, style):
581586
that and remove only the Javascript from the style; this catches
582587
more sneaky attempts.
583588
"""
589+
style = self._substitute_comments('', style)
590+
style = style.replace('\\', '')
584591
style = _substitute_whitespace('', style)
585592
style = style.lower()
586-
587-
for with_comments in True, False:
588-
if not with_comments:
589-
style = self._substitute_comments('', style)
590-
591-
style = style.replace('\\', '')
592-
593-
if _has_javascript_scheme(style):
594-
return True
595-
if 'expression(' in style:
596-
return True
597-
if '@import' in style:
598-
return True
599-
if '</noscript' in style:
600-
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
601-
return True
602-
if _looks_like_tag_content(style):
603-
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
604-
return True
593+
if _has_javascript_scheme(style):
594+
return True
595+
if 'expression(' in style:
596+
return True
597+
if '@import' in style:
598+
return True
599+
if '</noscript' in style:
600+
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
601+
return True
602+
if _looks_like_tag_content(style):
603+
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
604+
return True
605605
return False
606606

607+
def _remove_sneaky_css_comments(self, style):
608+
"""
609+
Look for suspicious code in CSS comment and if found,
610+
remove the entire comment from the given style.
611+
612+
Browsers might parse <style> as an ordinary HTML tag
613+
in some specific context and that might cause code in CSS
614+
comments to run.
615+
"""
616+
for match in self._find_comments(style):
617+
comment = match.group(0)
618+
print("f", comment)
619+
if _has_javascript_scheme(comment) or _looks_like_tag_content(comment):
620+
style = style.replace(comment, "/* deleted */")
621+
print("f", style)
622+
623+
return style
624+
607625
def clean_html(self, html):
608626
result_type = type(html)
609627
if isinstance(html, (str, bytes)):

Diff for: tests/test_clean.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -129,19 +129,21 @@ def test_sneaky_js_in_math_style(self):
129129

130130
def test_sneaky_js_in_style_comment_math_svg(self):
131131
for tag in "svg", "math":
132-
html = f'<{tag}><style>/*<img src onerror=alert(origin)>*/'
132+
html = f'<{tag}><style>p {{color: red;}}/*<img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></{tag}>'
133133
s = lxml.html.fragment_fromstring(html)
134134

135+
expected = f'<{tag}><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></{tag}>'.encode()
136+
135137
self.assertEqual(
136-
f'<{tag}><style>/* deleted */</style></{tag}>'.encode(),
138+
expected,
137139
lxml.html.tostring(clean_html(s)))
138140

139141
def test_sneaky_js_in_style_comment_noscript(self):
140-
html = '<noscript><style>/*</noscript><img src onerror=alert(origin)>*/'
142+
html = '<noscript><style>p {{color: red;}}/*</noscript><img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></noscript>'
141143
s = lxml.html.fragment_fromstring(html)
142144

143145
self.assertEqual(
144-
b'<noscript><style>/* deleted */</style></noscript>',
146+
b'<noscript><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></noscript>',
145147
lxml.html.tostring(clean_html(s)))
146148

147149
def test_sneaky_import_in_style(self):

0 commit comments

Comments
 (0)