Skip to content

Commit 95455db

Browse files
hroncok authored and frenzymadness committed
When host_whitelist is empty, don't bother parsing the URLs in allow_embedded_url
1 parent 88973ec commit 95455db

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

lxml_html_clean/clean.py

+2
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,8 @@ def allow_embedded_url(self, el, url):
524524
"""
525525
if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
526526
return False
527+
if not self.host_whitelist:
528+
return False
527529
parts = urlsplit(url)
528530
if parts.scheme not in ('http', 'https'):
529531
return False

tests/test_clean.py

+10 -1
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def test_memory_usage_many_elements_with_long_tails(self):
348348

349349
self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")
350350

351-
def test_possibly_invalid_url(self):
351+
def test_possibly_invalid_url_with_whitelist(self):
352352
cleaner = Cleaner(host_whitelist=['google.com'])
353353
with warnings.catch_warnings(record=True) as w:
354354
warnings.simplefilter("always")
@@ -359,3 +359,12 @@ def test_possibly_invalid_url(self):
359359
self.assertIn("impossible to parse the hostname", str(w[-1].message))
360360
self.assertNotIn("google.com", result)
361361
self.assertNotIn("example.com", result)
362+
363+
def test_possibly_invalid_url_without_whitelist(self):
364+
cleaner = Cleaner()
365+
with warnings.catch_warnings(record=True) as w:
366+
warnings.simplefilter("always")
367+
result = cleaner.clean_html(r"<p><iframe src='http://example.com:\@google.com'> </iframe></p>")
368+
self.assertEqual(len(w), 0)
369+
self.assertNotIn("google.com", result)
370+
self.assertNotIn("example.com", result)

0 commit comments

Comments
 (0)