From 7a95f685fddd6e4be25418c9574adb369d0b6c37 Mon Sep 17 00:00:00 2001 From: Hunter Date: Mon, 9 Sep 2024 14:22:22 -0400 Subject: [PATCH] implement more robust image converting/caching --- extensions/hunterirving/hunterirving.py | 9 +- extensions/reddit/reddit.py | 8 +- extensions/waybackmachine/waybackmachine.py | 82 ++++++++---- extensions/wiby/wiby.py | 21 ++-- image_utils.py | 65 ++++++++++ proxy.py | 133 ++++---------------- 6 files changed, 167 insertions(+), 151 deletions(-) create mode 100644 image_utils.py diff --git a/extensions/hunterirving/hunterirving.py b/extensions/hunterirving/hunterirving.py index f70cb1b..89b4ba6 100644 --- a/extensions/hunterirving/hunterirving.py +++ b/extensions/hunterirving/hunterirving.py @@ -2,6 +2,7 @@ import requests from bs4 import BeautifulSoup from datetime import datetime, timedelta +import mimetypes DOMAIN = "hunterirving.com" @@ -33,7 +34,13 @@ def handle_request(req): response = requests.get(url) response.raise_for_status() # Raise an exception for bad status codes - # Try to decode with UTF-8 first, then fall back to ISO-8859-1 + # Check if the content is an image + content_type = response.headers.get('Content-Type', '') + if content_type.startswith('image/'): + # For images, return the content as-is + return response.content, response.status_code, {'Content-Type': content_type} + + # For non-image content, proceed with HTML processing try: html_content = response.content.decode('utf-8') except UnicodeDecodeError: diff --git a/extensions/reddit/reddit.py b/extensions/reddit/reddit.py index c9169c6..7889ae8 100644 --- a/extensions/reddit/reddit.py +++ b/extensions/reddit/reddit.py @@ -10,6 +10,11 @@ import mimetypes DOMAIN = "reddit.com" +USER_AGENT = None + +def set_user_agent(user_agent): + global USER_AGENT + USER_AGENT = user_agent def handle_request(request): if request.method != 'GET': @@ -21,7 +26,8 @@ def handle_request(request): url = url.replace("reddit.com", "old.reddit.com", 1) try: - resp = requests.get(url, allow_redirects=True, timeout=10) + headers = {'User-Agent': USER_AGENT} if USER_AGENT else {} + resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10) resp.raise_for_status() return process_content(resp.content, url) except requests.RequestException as e: diff --git a/extensions/waybackmachine/waybackmachine.py b/extensions/waybackmachine/waybackmachine.py index d7267f1..31bedbb 100644 --- a/extensions/waybackmachine/waybackmachine.py +++ b/extensions/waybackmachine/waybackmachine.py @@ -1,5 +1,5 @@ from flask import request, render_template_string -from urllib.parse import urlparse, urlunparse +from urllib.parse import urlparse, urlunparse, urljoin from waybackpy import WaybackMachineCDXServerAPI import requests from bs4 import BeautifulSoup @@ -8,7 +8,6 @@ import re import mimetypes import os -from urllib.parse import urljoin DOMAIN = "web.archive.org" TARGET_DATE = "19960101" @@ -73,22 +72,22 @@ def get_override_status(): global override_active return override_active -def transform_url(url, base_url): - # If the URL is absolute, return it as is - if url.startswith(('http://', 'https://')): - return url +# def transform_url(url, base_url): +# # If the URL is absolute, return it as is +# if url.startswith(('http://', 'https://')): +# return url - # If the URL starts with '/web/', it's a wayback machine URL - if url.startswith('/web/'): - return f'http://{DOMAIN}{url}' +# # If the URL starts with '/web/', it's a wayback machine URL +# if url.startswith('/web/'): +# return f'http://{DOMAIN}{url}' - # If 
it's a relative URL
-	if not url.startswith('/'):
-		# Join it with the base URL
-		return urljoin(base_url, url)
+# 	# If it's a relative URL
+# 	if not url.startswith('/'):
+# 		# Join it with the base URL
+# 		return urljoin(base_url, url)
 
-	# For other cases (like URLs starting with '/'), join with the base URL
-	return urljoin(base_url, url)
+# 	# For other cases (like URLs starting with '/'), join with the base URL
+# 	return urljoin(base_url, url)
 
 def convert_ftp_to_http(url):
 	parsed = urlparse(url)
@@ -103,32 +102,59 @@ def process_html_content(content, base_url):
 
 	# Process all links
 	for a in soup.find_all('a', href=True):
-		a['href'] = transform_url(a['href'], base_url)
+		a['href'] = extract_original_url(a['href'], base_url)
 
 	# Process all images, scripts, and other resources
 	for tag in soup.find_all(['img', 'script', 'link', 'iframe'], src=True):
-		tag['src'] = transform_url(tag['src'], base_url)
+		tag['src'] = extract_original_url(tag['src'], base_url)
 
 	for tag in soup.find_all('link', href=True):
-		tag['href'] = transform_url(tag['href'], base_url)
+		tag['href'] = extract_original_url(tag['href'], base_url)
 
 	# Handle background images in style attributes
 	for tag in soup.find_all(style=True):
 		style = tag['style']
 		urls = re.findall(r'url\([\'"]?([^\'" \)]+)', style)
 		for url in urls:
-			new_url = transform_url(url, base_url)
+			new_url = extract_original_url(url, base_url)
 			style = style.replace(url, new_url)
 		tag['style'] = style
 
 	return str(soup)
 
-# def extract_original_url(wayback_url):
-# 	parsed = urlparse(wayback_url)
-# 	if parsed.netloc == DOMAIN and '/web/' in parsed.path:
-# 		path_parts = parsed.path.split('/', 3)
-# 		if len(path_parts) >= 4:
-# 			return 'http://' + path_parts[3]
-# 	return wayback_url
+def extract_original_url(url, base_url):
+	# Parse the base_url to extract the original domain and path
+	parsed_base = urlparse(base_url)
+	original_domain = parsed_base.netloc.split(':', 1)[0] # Remove port if present
+	original_path = ''
+
+	if original_domain == DOMAIN:
+		# Extract the original URL from the Wayback Machine URL
+		parts = parsed_base.path.split('/', 4)
+		if len(parts) >= 5:
+			original_domain = parts[3]
+			original_path = '/'.join(parts[4:]).rsplit('/', 1)[0]
+	else:
+		# If it's not a Wayback Machine URL, use the path from parsed_base
+		original_path = '/'.join(parsed_base.path.split('/')[:-1]) # Remove the file name from the path
+
+	# Case 1: If the URL is already absolute and not a Wayback Machine URL, return it as is
+	if url.startswith(('http://', 'https://')) and DOMAIN not in url:
+		return url
+
+	# Case 2: If it's a Wayback Machine URL, extract the original URL (keep its scheme if it already has one)
+	if url.startswith(('/web/', f'http://{DOMAIN}/web/', f'https://{DOMAIN}/web/')):
+		parts = url.split('/', 5)
+		if len(parts) >= 6:
+			return parts[5] if parts[5].startswith(('http://', 'https://')) else f'http://{parts[5]}'
+		return url
+
+	# Case 3: If it's a root-relative URL (starts with '/')
+	if url.startswith('/') and not url.startswith('/web/'):
+		return f'http://{original_domain}{url}'
+
+	# Case 4: For relative URLs, join with the original domain and path
+	full_base = f'http://{original_domain}{original_path}/'
+	return urljoin(full_base, url)
 
 def get_mime_type(url):
 	# Get the file extension
@@ -200,6 +226,7 @@ def handle_request(req):
 							selected_year=selected_year,
 							current_year=current_year,
 							date_update_message=date_update_message), 200
 
+	# If we're here, override is active and we're handling a non-wayback domain
 	try:
 		print('Handling request for:', req.url)
@@ -232,6 +259,9 @@
 	# Determine the content type
 	content_type = 
response.headers.get('Content-Type', '').split(';')[0].strip() + if content_type.startswith('image/'): + return content, response.status_code, {'Content-Type': content_type} + if not content_type: # If no content type is provided, guess based on the URL content_type, _ = mimetypes.guess_type(url) diff --git a/extensions/wiby/wiby.py b/extensions/wiby/wiby.py index 5846c50..5be0658 100644 --- a/extensions/wiby/wiby.py +++ b/extensions/wiby/wiby.py @@ -10,14 +10,8 @@ def handle_request(request): return handle_surprise(request) else: url = request.url.replace("https://", "http://", 1) - headers = { - "Accept": request.headers.get("Accept"), - "Accept-Language": request.headers.get("Accept-Language"), - "Referer": request.headers.get("Referer"), - "User-Agent": request.headers.get("User-Agent"), - } - resp = requests.get(url, headers=headers) + resp = requests.get(url) # If it's the homepage, modify the page structure if url == "http://wiby.me" or url == "http://wiby.me/": @@ -42,7 +36,9 @@ def get_final_surprise_url(): if resp.status_code in (301, 302, 303, 307, 308): url = urljoin(url, resp.headers['Location']) redirects += 1 - elif resp.status_code == 200: + continue + + if resp.status_code == 200: soup = BeautifulSoup(resp.content, 'html.parser') meta_tag = soup.find("meta", attrs={"http-equiv": "refresh"}) @@ -52,12 +48,9 @@ def get_final_surprise_url(): if len(parts) > 1: url = urljoin(url, parts[1].strip("'\"")) redirects += 1 - else: - return url - else: - return url - else: - return url + continue + + return url return url diff --git a/image_utils.py b/image_utils.py new file mode 100644 index 0000000..92b7521 --- /dev/null +++ b/image_utils.py @@ -0,0 +1,65 @@ +import os +import io +import hashlib +import requests +from PIL import Image +import mimetypes + +CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images") +MAX_WIDTH = 512 +MAX_HEIGHT = 342 +USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36" + +def is_image_url(url): + mime_type, _ = mimetypes.guess_type(url) + return mime_type and mime_type.startswith('image/') + +def optimize_image(image_data): + img = Image.open(io.BytesIO(image_data)) + + if img.mode != 'RGBA': + img = img.convert('RGBA') + + background = Image.new('RGBA', img.size, (255, 255, 255, 255)) + img = Image.alpha_composite(background, img) + img = img.convert('RGB') + + width, height = img.size + if width > MAX_WIDTH or height > MAX_HEIGHT: + ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height) + new_size = (int(width * ratio), int(height * ratio)) + img = img.resize(new_size, Image.LANCZOS) + + img = img.convert("L") + img = img.convert("1", dither=Image.FLOYDSTEINBERG) + + output = io.BytesIO() + img.save(output, format="GIF", optimize=True) + return output.getvalue() + +def fetch_and_cache_image(url, content=None): + try: + print(f"Processing image: {url}") + + file_name = hashlib.md5(url.encode()).hexdigest() + ".gif" + file_path = os.path.join(CACHE_DIR, file_name) + + if not os.path.exists(file_path): + print(f"Optimizing and caching image: {url}") + if content is None: + response = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + response.raise_for_status() + content = response.content + + optimized_image = optimize_image(content) + with open(file_path, 'wb') as f: + f.write(optimized_image) + else: + print(f"Image already cached: {url}") + + cached_url = f"/cached_image/{file_name}" + print(f"Cached URL: {cached_url}") + return cached_url 
+	except Exception as e:
+		print(f"Error processing image: {url}, Error: {str(e)}")
+		return None
\ No newline at end of file
diff --git a/proxy.py b/proxy.py
index 9d213d7..ce2c37a 100644
--- a/proxy.py
+++ b/proxy.py
@@ -10,6 +10,7 @@
 import hashlib
 import shutil
 import mimetypes
+from image_utils import is_image_url, fetch_and_cache_image, CACHE_DIR
 
 os.environ['FLASK_ENV'] = 'development'
 app = Flask(__name__)
@@ -21,11 +22,6 @@
 # Global variable to store the override extension
 override_extension = None
 
-# Global variables for image caching
-CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
-MAX_WIDTH = 512
-MAX_HEIGHT = 342
-
 # User-Agent string
 USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
 
@@ -55,88 +51,6 @@ def clear_image_cache():
 		extensions[ext] = module
 		domain_to_extension[module.DOMAIN] = module
 
-def is_image_url(url):
-	mime_type, _ = mimetypes.guess_type(url)
-	return mime_type and mime_type.startswith('image/')
-
-from PIL import Image
-import io
-
-def optimize_image(image_data):
-	img = Image.open(io.BytesIO(image_data))
-
-	# Convert to RGBA if it's not already
-	if img.mode != 'RGBA':
-		img = img.convert('RGBA')
-
-	# Create a white background
-	background = Image.new('RGBA', img.size, (255, 255, 255, 255))
-
-	# Alpha composite the image onto the background
-	img = Image.alpha_composite(background, img)
-
-	# Convert back to RGB
-	img = img.convert('RGB')
-
-	# Calculate the new size while maintaining aspect ratio
-	width, height = img.size
-	if width > MAX_WIDTH or height > MAX_HEIGHT:
-		ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
-		new_size = (int(width * ratio), int(height * ratio))
-		img = img.resize(new_size, Image.LANCZOS)
-
-	# Convert to grayscale
-	img = img.convert("L")
-
-	# Apply Floyd-Steinberg dithering and convert to 1-bit black and white
-	img = img.convert("1", dither=Image.FLOYDSTEINBERG)
-
-	# Save as 1-bit GIF
-	output = io.BytesIO()
-	img.save(output, format="GIF", optimize=True)
-	return output.getvalue()
-
-def fetch_and_cache_image(url):
-	try:
-		print(f"Fetching image: {url}")
-		response = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
-		response.raise_for_status()
-
-		# Generate a unique filename based on the URL
-		file_name = hashlib.md5(url.encode()).hexdigest() + ".gif"
-		file_path = os.path.join(CACHE_DIR, file_name)
-
-		# If the file doesn't exist, optimize and cache it
-		if not os.path.exists(file_path):
-			print(f"Optimizing and caching image: {url}")
-			optimized_image = optimize_image(response.content)
-			with open(file_path, 'wb') as f:
-				f.write(optimized_image)
-		else:
-			print(f"Image already cached: {url}")
-
-		cached_url = f"/cached_image/{file_name}"
-		print(f"Cached URL: {cached_url}")
-		return cached_url
-	except Exception as e:
-		print(f"Error processing image: {url}, Error: {str(e)}")
-		return None
-
-def replace_image_urls(content, base_url):
-	soup = BeautifulSoup(content, 'html.parser')
-	for img in soup.find_all('img'):
-		src = img.get('src')
-		if src:
-			full_url = urljoin(base_url, src)
-			print(f"Processing image: {full_url}")
-			cached_url = fetch_and_cache_image(full_url)
-			if cached_url:
-				img['src'] = cached_url
-				print(f"Replaced image URL: {src} -> {cached_url}")
-			else:
-				print(f"Failed to cache image: {full_url}")
-	return str(soup)
-
 @app.route("/cached_image/<filename>")
 def serve_cached_image(filename):
 	return send_from_directory(CACHE_DIR, filename, mimetype='image/gif')
@@ -161,11 +75,16 
@@ def handle_request(path): override_response = handle_override_extension(scheme) if override_response is not None: - return process_response_with_image_caching(override_response, request.url) + return process_response(override_response, request.url) matching_extension = find_matching_extension(host) if matching_extension: - return handle_matching_extension(matching_extension) + response = handle_matching_extension(matching_extension) + return process_response(response, request.url) + + # Only handle image requests here if we're not using an extension + if is_image_url(request.url) and not (override_extension or matching_extension): + return handle_image_request(request.url) return handle_default_request() @@ -177,7 +96,7 @@ def handle_override_extension(scheme): if scheme in ['http', 'https', 'ftp']: response = extensions[extension_name].handle_request(request) check_override_status(extension_name) - return process_response_with_image_caching(response, request.url) + return process_response(response, request.url) else: print(f"Warning: Unsupported scheme '{scheme}' for override extension.") else: @@ -206,16 +125,12 @@ def handle_matching_extension(matching_extension): override_extension = matching_extension.__name__ print(f"Override enabled for {override_extension}") - # Use the original request URL as the base URL - return process_response_with_image_caching(response, request.url) + # Return the response directly + return response + +def process_response(response, url): + print(f"Processing response for URL: {url}") -def process_response_with_image_caching(response, base_url): - print(f"Processing response for URL: {base_url}") - - # Check if the response is for an image URL - if is_image_url(base_url): - return handle_image_request(base_url) - if isinstance(response, tuple): if len(response) == 3: content, status_code, headers = response @@ -227,7 +142,6 @@ def process_response_with_image_caching(response, base_url): status_code = 200 headers = {} elif isinstance(response, Response): - print("Response is already a Flask Response object") return response else: content = response @@ -237,6 +151,14 @@ def process_response_with_image_caching(response, base_url): content_type = headers.get('Content-Type', '').lower() print(f"Content-Type: {content_type}") + if content_type.startswith('image/'): + # For image content, use the fetch_and_cache_image function + cached_url = fetch_and_cache_image(url, content) + if cached_url: + return send_from_directory(CACHE_DIR, os.path.basename(cached_url), mimetype='image/gif') + else: + return abort(404, "Image could not be processed") + # List of content types that should not be transcoded non_transcode_types = [ 'application/octet-stream', @@ -264,11 +186,6 @@ def process_response_with_image_caching(response, base_url): if isinstance(content, bytes): content = content.decode('utf-8', errors='replace') - # Apply image caching for HTML-like content - if content_type.startswith('text/html') or '
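
A note on the extract_original_url() rewrite in waybackmachine.py: Case 2 relies on the shape of Wayback Machine paths, /web/<timestamp>/<original-url>, where splitting on the first five slashes leaves the original URL in the final field. A minimal standalone sketch of just that step (strip_wayback_prefix is a hypothetical name and the example URLs are placeholders):

DOMAIN = "web.archive.org"

def strip_wayback_prefix(url):
	# parts[5] holds everything past /web/<timestamp>/, i.e. the original
	# URL, with its scheme intact for absolute archive links
	parts = url.split('/', 5)
	if len(parts) >= 6:
		original = parts[5]
		return original if original.startswith(('http://', 'https://')) else f'http://{original}'
	return url

print(strip_wayback_prefix('/web/19960101000000/http://example.com/index.html'))
print(strip_wayback_prefix(f'http://{DOMAIN}/web/19960101000000/http://example.com/index.html'))
# both print: http://example.com/index.html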
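The conversion pipeline now centralized in image_utils.optimize_image() does the heavy lifting: flatten transparency onto a white background, downscale proportionally into 512x342 (the classic Macintosh screen), grayscale, then Floyd-Steinberg dither down to 1-bit and emit a GIF. It can be exercised standalone; a sketch assuming Pillow is installed and a Pillow-readable test.png sits next to the script (to_one_bit_gif is a hypothetical name):

import io
from PIL import Image

MAX_WIDTH, MAX_HEIGHT = 512, 342  # classic Macintosh display size

def to_one_bit_gif(image_data):
	img = Image.open(io.BytesIO(image_data))

	# Flatten any transparency onto a white background
	if img.mode != 'RGBA':
		img = img.convert('RGBA')
	background = Image.new('RGBA', img.size, (255, 255, 255, 255))
	img = Image.alpha_composite(background, img).convert('RGB')

	# Downscale proportionally so the image fits the target display
	width, height = img.size
	if width > MAX_WIDTH or height > MAX_HEIGHT:
		ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
		img = img.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)

	# Grayscale, then Floyd-Steinberg dither to 1-bit black and white
	img = img.convert("L").convert("1", dither=Image.FLOYDSTEINBERG)

	output = io.BytesIO()
	img.save(output, format="GIF", optimize=True)
	return output.getvalue()

with open("test.png", "rb") as f:
	gif_bytes = to_one_bit_gif(f.read())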
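Caching in fetch_and_cache_image() is keyed on the MD5 hex digest of the source URL, so each image is fetched and dithered at most once and is thereafter served from disk by the /cached_image/<filename> route in proxy.py. A sketch of the key derivation alongside a matching route (the standalone Flask app and the cache_path helper are illustrative, not part of the patch):

import hashlib
import os
from flask import Flask, send_from_directory

CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
app = Flask(__name__)

def cache_path(url):
	# Deterministic key: the same source URL always maps to the same file,
	# so os.path.exists() doubles as the cache-hit test
	file_name = hashlib.md5(url.encode()).hexdigest() + ".gif"
	return os.path.join(CACHE_DIR, file_name)

@app.route("/cached_image/<filename>")
def serve_cached_image(filename):
	# Every cache entry is a 1-bit GIF, so the MIME type is fixed
	return send_from_directory(CACHE_DIR, filename, mimetype='image/gif')

Keying on the URL rather than the image bytes means a changed image at an unchanged URL stays stale until the cache is cleared, which is presumably what clear_image_cache() in proxy.py is for.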