Skip to content

Commit

Permalink
use WHITELISTED_DOMAINS to conditionally disable postprocessing
Browse files (browse the repository at this point in the history)
  • Loading branch information
hunterirving committed Nov 9, 2024
1 parent 4186114 commit 1dd95d9
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 6 deletions.
2 changes: 1 addition & 1 deletion proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def process_response(response, url):
if isinstance(content, bytes):
content = content.decode('utf-8', errors='replace')

content = transcode_html(content)
content = transcode_html(content, url)
else:
print(f"Content type {content_type} should not be transcoded, passing through unchanged")

Expand Down
20 changes: 15 additions & 5 deletions utils/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,15 @@ def transcode_content(content):

return content.encode('utf-8')

def transcode_html(html):

def transcode_html(html, url=None):
"""
Uses BeautifulSoup to transcode payloads of the text/html content type
"""
if isinstance(html, bytes):
html = html.decode("utf-8", errors="replace")

# Handle character conversion regardless of whitelist status
if CONVERT_CHARACTERS:
for key, replacement in CONVERSION_TABLE.items():
if isinstance(replacement, bytes):
Expand All @@ -65,7 +67,7 @@ def transcode_html(html):

soup = BeautifulSoup(html, "html.parser")

# Convert all HTTPS resources to HTTP through our proxy
# Always convert HTTPS to HTTP regardless of whitelist status
for tag in soup(['link', 'script', 'img', 'a', 'iframe']):
# Handle src attributes
if 'src' in tag.attrs:
Expand All @@ -81,20 +83,28 @@ def transcode_html(html):
elif tag['href'].startswith('//'): # Handle protocol-relative URLs
tag['href'] = 'http:' + tag['href']

if SIMPLIFY_HTML:
# Check if domain is whitelisted
is_whitelisted = False
if url:
from urllib.parse import urlparse
domain = urlparse(url).netloc
is_whitelisted = any(domain.endswith(whitelisted) for whitelisted in WHITELISTED_DOMAINS)

# Only perform tag/attribute stripping if the domain is not whitelisted and SIMPLIFY_HTML is True
if SIMPLIFY_HTML and not is_whitelisted:
for tag in soup(TAGS_TO_STRIP):
tag.decompose()
for tag in soup():
for attr in ATTRIBUTES_TO_STRIP:
if attr in tag.attrs:
del tag[attr]

# Remove any meta refresh tags that might use HTTPS
# Always handle meta refresh tags
for tag in soup.find_all('meta', attrs={'http-equiv': 'refresh'}):
if 'content' in tag.attrs and 'https://' in tag['content']:
tag['content'] = tag['content'].replace('https://', 'http://')

# Handle CSS with inline URLs
# Always handle CSS with inline URLs
for tag in soup.find_all(['style', 'link']):
if tag.string:
tag.string = tag.string.replace('https://', 'http://')
Expand Down

0 comments on commit 1dd95d9

Please sign in to comment.