From 95455dbb2d7b0eecf6a341eabdee92830b0f7252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miro=20Hron=C4=8Dok?= Date: Wed, 9 Oct 2024 14:48:50 +0200 Subject: [PATCH] When host_whitelist is empty, don't bother parsing the URLs in allow_embedded_url --- lxml_html_clean/clean.py | 2 ++ tests/test_clean.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index b410382..a71da81 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -524,6 +524,8 @@ def allow_embedded_url(self, el, url): """ if self.whitelist_tags is not None and el.tag not in self.whitelist_tags: return False + if not self.host_whitelist: + return False parts = urlsplit(url) if parts.scheme not in ('http', 'https'): return False diff --git a/tests/test_clean.py b/tests/test_clean.py index 85692a1..8c9bc20 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -348,7 +348,7 @@ def test_memory_usage_many_elements_with_long_tails(self): self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB") - def test_possibly_invalid_url(self): + def test_possibly_invalid_url_with_whitelist(self): cleaner = Cleaner(host_whitelist=['google.com']) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -359,3 +359,12 @@ def test_possibly_invalid_url(self): self.assertIn("impossible to parse the hostname", str(w[-1].message)) self.assertNotIn("google.com", result) self.assertNotIn("example.com", result) + + def test_possibly_invalid_url_without_whitelist(self): + cleaner = Cleaner() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = cleaner.clean_html(r"

") + self.assertEqual(len(w), 0) + self.assertNotIn("google.com", result) + self.assertNotIn("example.com", result)