Skip to content

Commit 8b3c612

Browse files
committed
Raise warning for unstable URL parsing
`urllib.parse` does not perform any input validation so its output might be invalid as well. Because host_whitelist functionality relies on hostnames parsed from URLs, the result of `urlsplit` is newly compared with the result of simple regex parser and If they differ, a warning is raised.
1 parent 8ce436d commit 8b3c612

File tree

4 files changed

+65
-3
lines changed

4 files changed

+65
-3
lines changed

Diff for: lxml_html_clean/__init__.pyi

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,7 @@ from .clean import (
55
autolink as autolink,
66
autolink_html as autolink_html,
77
word_break as word_break,
8-
word_break_html as word_break_html
8+
word_break_html as word_break_html,
9+
LXMLHTMLCleanWarning as LXMLHTMLCleanWarning,
10+
AmbiguousURLWarning as AmbiguousURLWarning,
911
)

Diff for: lxml_html_clean/clean.py

+37-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import re
1111
from collections import deque
1212
from urllib.parse import urlsplit, unquote_plus
13+
import warnings
1314

1415
from lxml import etree
1516
from lxml.html import defs
@@ -18,7 +19,7 @@
1819

1920

2021
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
21-
'word_break', 'word_break_html']
22+
'word_break', 'word_break_html', 'LXMLHTMLCleanWarning', 'AmbiguousURLWarning']
2223

2324
# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
2425
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
@@ -100,6 +101,33 @@ def fromstring(string):
100101
return lxml_fromstring(_ascii_control_characters.sub("", string))
101102

102103

104+
# This regular expression is inspired by the one in urllib3.
105+
_URI_RE = re.compile(
106+
r"^(?:(?P<scheme>[a-zA-Z][a-zA-Z0-9+.-]*[a-zA-Z0-9]):)?"
107+
r"(?://(?P<authority>[^\\/?#]*))?"
108+
r"(?P<path>[^?#]*)"
109+
r"(?:\?(?P<query_string>[^#]*))?"
110+
r"(?:#(?P<fragment>.*))?$",
111+
re.UNICODE,
112+
)
113+
114+
115+
def _get_authority_from_url(url):
116+
match = _URI_RE.match(url)
117+
if match:
118+
return match.group("authority")
119+
else:
120+
return None
121+
122+
123+
class LXMLHTMLCleanWarning(Warning):
124+
pass
125+
126+
127+
class AmbiguousURLWarning(LXMLHTMLCleanWarning):
128+
pass
129+
130+
103131
class Cleaner:
104132
"""
105133
Instances cleans the document of each of the possible offending
@@ -499,6 +527,14 @@ def allow_embedded_url(self, el, url):
499527
parts = urlsplit(url)
500528
if parts.scheme not in ('http', 'https'):
501529
return False
530+
531+
authority = _get_authority_from_url(url)
532+
if (parts.netloc or authority) and parts.netloc != authority:
533+
warnings.warn(f"It's impossible to parse the hostname from URL: '{url}'! "
534+
"URL is not allowed because parsers returned ambiguous results.",
535+
AmbiguousURLWarning)
536+
return False
537+
502538
if parts.hostname in self.host_whitelist:
503539
return True
504540
return False

Diff for: lxml_html_clean/clean.pyi

+11
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ _DT = TypeVar("_DT", str, bytes, HtmlElement)
88
_ET_DT = TypeVar("_ET_DT", str, bytes, HtmlElement, _ElementTree[HtmlElement])
99

1010

11+
def _get_authority_from_url(url: str) -> str | None: ...
12+
13+
14+
class LXMLHTMLCleanWarning(Warning):
15+
pass
16+
17+
18+
class AmbiguousURLWarning(LXMLHTMLCleanWarning):
19+
pass
20+
21+
1122
class Cleaner:
1223
@overload # allow_tags present, remove_unknown_tags must be False
1324
def __init__(

Diff for: tests/test_clean.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
import gzip
33
import io
44
import unittest
5+
import warnings
56

67
import lxml.html
7-
from lxml_html_clean import Cleaner, clean_html
8+
from lxml_html_clean import AmbiguousURLWarning, Cleaner, clean_html, LXMLHTMLCleanWarning
89
from .utils import peak_memory_usage
910

1011

@@ -346,3 +347,15 @@ def test_memory_usage_many_elements_with_long_tails(self):
346347
mem = peak_memory_usage(cleaner.clean_html, html)
347348

348349
self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")
350+
351+
def test_possibly_invalid_url(self):
352+
cleaner = Cleaner(host_whitelist=['google.com'])
353+
with warnings.catch_warnings(record=True) as w:
354+
warnings.simplefilter("always")
355+
result = cleaner.clean_html(r"<p><iframe src='http://example.com:\@google.com'> </iframe></p>")
356+
self.assertGreaterEqual(len(w), 1)
357+
self.assertIs(w[-1].category, AmbiguousURLWarning)
358+
self.assertTrue(issubclass(w[-1].category, LXMLHTMLCleanWarning))
359+
self.assertIn("impossible to parse the hostname", str(w[-1].message))
360+
self.assertNotIn("google.com", result)
361+
self.assertNotIn("example.com", result)

0 commit comments

Comments
 (0)