implement more robust image converting/caching
hunterirving committed Sep 9, 2024
1 parent 95f8be6 commit 7a95f68
Showing 6 changed files with 167 additions and 151 deletions.
9 changes: 8 additions & 1 deletion extensions/hunterirving/hunterirving.py
@@ -2,6 +2,7 @@
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import mimetypes

DOMAIN = "hunterirving.com"

@@ -33,7 +34,13 @@ def handle_request(req):
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad status codes

# Try to decode with UTF-8 first, then fall back to ISO-8859-1
# Check if the content is an image
content_type = response.headers.get('Content-Type', '')
if content_type.startswith('image/'):
# For images, return the content as-is
return response.content, response.status_code, {'Content-Type': content_type}

# For non-image content, proceed with HTML processing
try:
html_content = response.content.decode('utf-8')
except UnicodeDecodeError:
8 changes: 7 additions & 1 deletion extensions/reddit/reddit.py
@@ -10,6 +10,11 @@
import mimetypes

DOMAIN = "reddit.com"
USER_AGENT = None

def set_user_agent(user_agent):
global USER_AGENT
USER_AGENT = user_agent

def handle_request(request):
if request.method != 'GET':
@@ -21,7 +26,8 @@ def handle_request(request):
url = url.replace("reddit.com", "old.reddit.com", 1)

try:
resp = requests.get(url, allow_redirects=True, timeout=10)
headers = {'User-Agent': USER_AGENT} if USER_AGENT else {}
resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
resp.raise_for_status()
return process_content(resp.content, url)
except requests.RequestException as e:
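
For context, a minimal sketch of how the new set_user_agent() hook might be wired up by the proxy at startup. The call site is not part of this diff, and the import path and User-Agent string below are assumptions.

# Hypothetical startup wiring (assumes extensions/ is importable as a package):
from extensions.reddit import reddit

# Hand the extension a browser-like User-Agent before any requests reach
# handle_request(); if set_user_agent() is never called, requests go out
# with empty headers, as in the diff above.
reddit.set_user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36")
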
82 changes: 56 additions & 26 deletions extensions/waybackmachine/waybackmachine.py
@@ -1,5 +1,5 @@
from flask import request, render_template_string
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlparse, urlunparse, urljoin
from waybackpy import WaybackMachineCDXServerAPI
import requests
from bs4 import BeautifulSoup
@@ -8,7 +8,6 @@
import re
import mimetypes
import os
from urllib.parse import urljoin

DOMAIN = "web.archive.org"
TARGET_DATE = "19960101"
@@ -73,22 +72,22 @@ def get_override_status():
global override_active
return override_active

def transform_url(url, base_url):
# If the URL is absolute, return it as is
if url.startswith(('http://', 'https://')):
return url
# def transform_url(url, base_url):
# # If the URL is absolute, return it as is
# if url.startswith(('http://', 'https://')):
# return url

# If the URL starts with '/web/', it's a wayback machine URL
if url.startswith('/web/'):
return f'http://{DOMAIN}{url}'
# # If the URL starts with '/web/', it's a wayback machine URL
# if url.startswith('/web/'):
# return f'http://{DOMAIN}{url}'

# If it's a relative URL
if not url.startswith('/'):
# Join it with the base URL
return urljoin(base_url, url)
# # If it's a relative URL
# if not url.startswith('/'):
# # Join it with the base URL
# return urljoin(base_url, url)

# For other cases (like URLs starting with '/'), join with the base URL
return urljoin(base_url, url)
# # For other cases (like URLs starting with '/'), join with the base URL
# return urljoin(base_url, url)

def convert_ftp_to_http(url):
parsed = urlparse(url)
@@ -103,32 +102,59 @@ def process_html_content(content, base_url):

# Process all links
for a in soup.find_all('a', href=True):
a['href'] = transform_url(a['href'], base_url)
a['href'] = extract_original_url(a['href'], base_url)

# Process all images, scripts, and other resources
for tag in soup.find_all(['img', 'script', 'link', 'iframe'], src=True):
tag['src'] = transform_url(tag['src'], base_url)
tag['src'] = extract_original_url(tag['src'], base_url)
for tag in soup.find_all('link', href=True):
tag['href'] = transform_url(tag['href'], base_url)
tag['href'] = extract_original_url(tag['href'], base_url)

# Handle background images in style attributes
for tag in soup.find_all(style=True):
style = tag['style']
urls = re.findall(r'url\([\'"]?([^\'" \)]+)', style)
for url in urls:
new_url = transform_url(url, base_url)
new_url = extract_original_url(url, base_url)
style = style.replace(url, new_url)
tag['style'] = style

return str(soup)

# def extract_original_url(wayback_url):
# parsed = urlparse(wayback_url)
# if parsed.netloc == DOMAIN and '/web/' in parsed.path:
# path_parts = parsed.path.split('/', 3)
# if len(path_parts) >= 4:
# return 'http://' + path_parts[3]
# return wayback_url
def extract_original_url(url, base_url):
# Parse the base_url to extract the original domain and path
parsed_base = urlparse(base_url)
original_domain = parsed_base.netloc.split(':', 1)[0] # Remove port if present
original_path = ''

if original_domain == DOMAIN:
# Extract the original URL from the Wayback Machine URL
parts = parsed_base.path.split('/', 4)
if len(parts) >= 5:
original_domain = parts[3]
original_path = '/'.join(parts[4:]).rsplit('/', 1)[0]
else:
# If it's not a Wayback Machine URL, use the path from parsed_base
original_path = '/'.join(parsed_base.path.split('/')[:-1]) # Remove the file name from the path

# Case 1: If the URL is already absolute and not a Wayback Machine URL, return it as is
if url.startswith(('http://', 'https://')) and DOMAIN not in url:
return url

# Case 2: If it's a Wayback Machine URL, extract the original URL
if url.startswith(('/web/', f'http://{DOMAIN}/web/', f'https://{DOMAIN}/web/')):
parts = url.split('/', 5)
if len(parts) >= 6:
return f'http://{parts[5]}'
return url

# Case 3: If it's a root-relative URL (starts with '/')
if url.startswith('/') and not url.startswith('/web/'):
return f'http://{original_domain}{url}'

# Case 4: For relative URLs, join with the original domain and path
full_base = f'http://{original_domain}{original_path}/'
return urljoin(full_base, url)

def get_mime_type(url):
# Get the file extension
@@ -200,6 +226,7 @@ def handle_request(req):
selected_year=selected_year,
current_year=current_year,
date_update_message=date_update_message), 200

# If we're here, override is active and we're handling a non-wayback domain
try:
print('Handling request for:', req.url)
@@ -232,6 +259,9 @@
# Determine the content type
content_type = response.headers.get('Content-Type', '').split(';')[0].strip()

if content_type.startswith('image/'):
return content, response.status_code, {'Content-Type': content_type}

if not content_type:
# If no content type is provided, guess based on the URL
content_type, _ = mimetypes.guess_type(url)
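
For reference, two illustrative calls to the new extract_original_url() helper (not part of the diff; the example URLs are made up):

# Assumes extensions/ is importable as a package.
from extensions.waybackmachine.waybackmachine import extract_original_url

base = "http://web.archive.org/web/19961220154510/http://www.example.com/index.html"

# Archived '/web/...' links are unwrapped back to the original host:
extract_original_url("/web/19961220154510/http://www.example.com/pics/logo.gif", base)
# -> "http://www.example.com/pics/logo.gif"

# Absolute URLs that do not point at web.archive.org pass through unchanged:
extract_original_url("http://www.example.org/other.html", base)
# -> "http://www.example.org/other.html"
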
21 changes: 7 additions & 14 deletions extensions/wiby/wiby.py
@@ -10,14 +10,8 @@ def handle_request(request):
return handle_surprise(request)
else:
url = request.url.replace("https://", "http://", 1)
headers = {
"Accept": request.headers.get("Accept"),
"Accept-Language": request.headers.get("Accept-Language"),
"Referer": request.headers.get("Referer"),
"User-Agent": request.headers.get("User-Agent"),
}

resp = requests.get(url, headers=headers)
resp = requests.get(url)

# If it's the homepage, modify the page structure
if url == "http://wiby.me" or url == "http://wiby.me/":
@@ -42,7 +36,9 @@ def get_final_surprise_url():
if resp.status_code in (301, 302, 303, 307, 308):
url = urljoin(url, resp.headers['Location'])
redirects += 1
elif resp.status_code == 200:
continue

if resp.status_code == 200:
soup = BeautifulSoup(resp.content, 'html.parser')
meta_tag = soup.find("meta", attrs={"http-equiv": "refresh"})

@@ -52,12 +48,9 @@
if len(parts) > 1:
url = urljoin(url, parts[1].strip("'\""))
redirects += 1
else:
return url
else:
return url
else:
return url
continue

return url

return url

65 changes: 65 additions & 0 deletions image_utils.py
@@ -0,0 +1,65 @@
import os
import io
import hashlib
import requests
from PIL import Image
import mimetypes

CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
MAX_WIDTH = 512
MAX_HEIGHT = 342
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"

def is_image_url(url):
    mime_type, _ = mimetypes.guess_type(url)
    return mime_type and mime_type.startswith('image/')

def optimize_image(image_data):
    img = Image.open(io.BytesIO(image_data))

    # Flatten any transparency onto a white background
    if img.mode != 'RGBA':
        img = img.convert('RGBA')

    background = Image.new('RGBA', img.size, (255, 255, 255, 255))
    img = Image.alpha_composite(background, img)
    img = img.convert('RGB')

    # Scale down to fit within MAX_WIDTH x MAX_HEIGHT, preserving aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Reduce to 1-bit black and white with Floyd-Steinberg dithering
    img = img.convert("L")
    img = img.convert("1", dither=Image.FLOYDSTEINBERG)

    # Save as an optimized GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()

def fetch_and_cache_image(url, content=None):
    try:
        print(f"Processing image: {url}")

        # Cache key is the MD5 of the source URL; every cached file is a GIF
        file_name = hashlib.md5(url.encode()).hexdigest() + ".gif"
        file_path = os.path.join(CACHE_DIR, file_name)

        if not os.path.exists(file_path):
            print(f"Optimizing and caching image: {url}")
            # Callers may pass already-downloaded bytes; otherwise fetch the URL
            if content is None:
                response = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
                response.raise_for_status()
                content = response.content

            optimized_image = optimize_image(content)
            with open(file_path, 'wb') as f:
                f.write(optimized_image)
        else:
            print(f"Image already cached: {url}")

        cached_url = f"/cached_image/{file_name}"
        print(f"Cached URL: {cached_url}")
        return cached_url
    except Exception as e:
        print(f"Error processing image: {url}, Error: {str(e)}")
        return None
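
A minimal usage sketch for the new module (illustrative only, not part of the commit). It assumes the caller creates the cache directory, since the module as shown does not, and that the proxy serves files under the /cached_image/ path.

import os

from image_utils import CACHE_DIR, fetch_and_cache_image, is_image_url

os.makedirs(CACHE_DIR, exist_ok=True)

url = "http://example.com/photo.jpg"  # hypothetical image URL
if is_image_url(url):
    cached_path = fetch_and_cache_image(url)
    if cached_path:
        # cached_path is a site-relative path like /cached_image/<md5>.gif,
        # pointing at the dithered 1-bit GIF written to CACHE_DIR
        print(f"Serve {cached_path}")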