porting image conversion out of wayback
hunterirving committed Sep 9, 2024
1 parent 8823bb7 commit 95f8be6
Showing 3 changed files with 131 additions and 150 deletions.
82 changes: 6 additions & 76 deletions extensions/reddit/reddit.py
@@ -10,86 +10,18 @@
import mimetypes

DOMAIN = "reddit.com"
CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
image_counter = 0
MAX_WIDTH = 512
MAX_HEIGHT = 342

def clear_image_cache():
    global image_counter
    if os.path.exists(CACHE_DIR):
        shutil.rmtree(CACHE_DIR)
    os.makedirs(CACHE_DIR, exist_ok=True)
    image_counter = 0

# Call this function when the extension is loaded
clear_image_cache()

def optimize_image(image_data):
    img = Image.open(io.BytesIO(image_data))

    # Calculate the new size while maintaining aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Convert to black and white
    img = img.convert("1")

    # Save as 1-bit GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()

def fetch_and_cache_image(url):
    global image_counter
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Optimize the image
        optimized_image = optimize_image(response.content)

        # Increment the counter and use it for the filename
        image_counter += 1
        file_name = f"img_{image_counter:04d}.gif"
        file_path = os.path.join(CACHE_DIR, file_name)

        with open(file_path, 'wb') as f:
            f.write(optimized_image)

        return f"http://reddit.com/cached_image/{file_name}"
    except Exception as e:
        print(f"Error processing image: {str(e)}")
        return None

def handle_request(request):
    if request.method != 'GET':
        return Response("Only GET requests are supported", status=405)

    url = request.url
    if url.startswith("http://reddit.com/cached_image/"):
        file_name = url.split("/")[-1]
        file_path = os.path.join(CACHE_DIR, file_name)
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                return Response(f.read(), mimetype='image/gif')
        else:
            return Response("Image not found", status=404)

    if not url.startswith(('http://old.reddit.com', 'https://old.reddit.com')):
        url = url.replace("reddit.com", "old.reddit.com", 1)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    try:
        resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
        resp = requests.get(url, allow_redirects=True, timeout=10)
        resp.raise_for_status()
        return process_content(resp.content, url)
    except requests.RequestException as e:
@@ -201,12 +133,10 @@ def process_content(content, url):
            enclosing_a = img.find_parent('a')
            if enclosing_a and enclosing_a.has_attr('href'):
                img_src = enclosing_a['href']
                cached_url = fetch_and_cache_image(img_src)
                if cached_url:
                    new_img = new_soup.new_tag('img', src=cached_url, width="50", height="40")
                    d.append(new_img)
                    d.append(" ")  # Add space between images

                new_img = new_soup.new_tag('img', src=img_src, width="50", height="40")
                d.append(new_img)
                d.append(" ")  # Add space between images

        # Add post content if it exists
        usertext_body = thing.find('div', class_='usertext-body')
        if usertext_body:
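For context, a minimal sketch of the flow this diff appears to set up: the reddit extension now emits plain <img> tags that point at the original image URL, and the helpers added to proxy.py further down are what fetch and convert that image when the browser asks for it. The URL below is hypothetical and the snippet is illustrative only, not part of the commit.

# Illustrative sketch, not part of this commit; assumes the proxy.py helpers shown below are in scope.
url = "http://example.com/picture.jpg"    # hypothetical image URL from a rewritten <img> tag
if is_image_url(url):                     # mimetypes-based check added in proxy.py
    response = handle_image_request(url)  # fetches the image, dithers it to a 1-bit GIF, serves it from the cache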
87 changes: 39 additions & 48 deletions extensions/waybackmachine/waybackmachine.py
@@ -8,6 +8,7 @@
import re
import mimetypes
import os
from urllib.parse import urljoin

DOMAIN = "web.archive.org"
TARGET_DATE = "19960101"
@@ -72,40 +73,22 @@ def get_override_status():
    global override_active
    return override_active

def transform_url(url):
    # If the URL is relative and starts with '/web/', prepend the DOMAIN
    if url.startswith('/web/'):
        return f'http://{DOMAIN}{url}'
def transform_url(url, base_url):
    # If the URL is absolute, return it as is
    if url.startswith(('http://', 'https://')):
        return url

    # If the URL is relative (doesn't start with a scheme or '/'), leave it as is
    if not re.match(r'^[a-zA-Z]+://|^/', url):
        return url
    # If the URL starts with '/web/', it's a wayback machine URL
    if url.startswith('/web/'):
        return f'http://{DOMAIN}{url}'

    # Parse the URL
    parsed = urlparse(url)
    # If it's a relative URL
    if not url.startswith('/'):
        # Join it with the base URL
        return urljoin(base_url, url)

    # Regular expression to match Wayback Machine URL pattern
    wayback_pattern = r'^/web/(\d{14})/(.+)'

    # Case 1: URL starts with "/web/" followed by 14 digits
    if parsed.path.startswith('/web/'):
        match = re.match(wayback_pattern, parsed.path)
        if match:
            original_url = match.group(2)
            return convert_ftp_to_http(original_url)

    # Case 2: Full Wayback Machine URL
    if parsed.netloc == DOMAIN:
        match = re.match(wayback_pattern, parsed.path)
        if match:
            original_url = match.group(2)
            # Ensure the URL has a scheme
            if not re.match(r'^[a-zA-Z]+://', original_url):
                original_url = 'http://' + original_url
            return convert_ftp_to_http(original_url)

    # If it's not a Wayback Machine URL, still convert FTP to HTTP
    return convert_ftp_to_http(url)
    # For other cases (like URLs starting with '/'), join with the base URL
    return urljoin(base_url, url)

def convert_ftp_to_http(url):
    parsed = urlparse(url)
@@ -115,20 +98,29 @@ def convert_ftp_to_http(url):
        return urlunparse(new_parsed)
    return url
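To illustrate the new signature, a few expected results of transform_url(url, base_url) as defined above, assuming a hypothetical archived page at http://www.example.com/dir/page.html. This is illustrative only, not part of the commit.

# Illustrative sketch, not part of this commit.
base = "http://www.example.com/dir/page.html"                 # hypothetical base URL of the archived page
transform_url("http://other.com/a.gif", base)                 # absolute -> returned unchanged
transform_url("/web/19960101000000/http://other.com/", base)  # -> "http://web.archive.org/web/19960101000000/http://other.com/"
transform_url("images/logo.gif", base)                        # relative -> "http://www.example.com/dir/images/logo.gif"
transform_url("/pics/logo.gif", base)                         # root-relative -> "http://www.example.com/pics/logo.gif"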

def process_html_content(content):
    soup = BeautifulSoup(content, 'html.parser')

    # Process all links
    for a in soup.find_all('a', href=True):
        a['href'] = transform_url(a['href'])

    # Process all images, scripts, and other resources
    for tag in soup.find_all(['img', 'script', 'link'], src=True):
        tag['src'] = transform_url(tag['src'])
    for tag in soup.find_all('link', href=True):
        tag['href'] = transform_url(tag['href'])

    return str(soup)
def process_html_content(content, base_url):
    soup = BeautifulSoup(content, 'html.parser')

    # Process all links
    for a in soup.find_all('a', href=True):
        a['href'] = transform_url(a['href'], base_url)

    # Process all images, scripts, and other resources
    for tag in soup.find_all(['img', 'script', 'link', 'iframe'], src=True):
        tag['src'] = transform_url(tag['src'], base_url)
    for tag in soup.find_all('link', href=True):
        tag['href'] = transform_url(tag['href'], base_url)

    # Handle background images in style attributes
    for tag in soup.find_all(style=True):
        style = tag['style']
        urls = re.findall(r'url\([\'"]?([^\'" \)]+)', style)
        for url in urls:
            new_url = transform_url(url, base_url)
            style = style.replace(url, new_url)
        tag['style'] = style

    return str(soup)

# def extract_original_url(wayback_url):
# parsed = urlparse(wayback_url)
@@ -208,7 +200,6 @@ def handle_request(req):
        selected_year=selected_year,
        current_year=current_year,
        date_update_message=date_update_message), 200

    # If we're here, override is active and we're handling a non-wayback domain
    try:
        print('Handling request for:', req.url)
@@ -235,7 +226,7 @@ def handle_request(req):

        # Fetch the content of the archived page
        response = requests.get(snapshot.archive_url, headers={'User-Agent': user_agent})
        content = response.content  # Use content instead of text to handle binary data
        content = response.content
        print("Content fetched, length:", len(content))

        # Determine the content type
@@ -254,7 +245,7 @@ def handle_request(req):
        # Process HTML content
        if content_type.startswith('text/html'):
            content = content.decode('utf-8', errors='replace')
            processed_content = process_html_content(content)
            processed_content = process_html_content(content, url)
            return processed_content, response.status_code, {'Content-Type': 'text/html'}

        # For text-based content types, decode and return as string
112 changes: 86 additions & 26 deletions proxy.py
@@ -9,6 +9,7 @@
from PIL import Image
import hashlib
import shutil
import mimetypes

os.environ['FLASK_ENV'] = 'development'
app = Flask(__name__)
@@ -54,23 +55,46 @@ def clear_image_cache():
    extensions[ext] = module
    domain_to_extension[module.DOMAIN] = module

def is_image_url(url):
    mime_type, _ = mimetypes.guess_type(url)
    return mime_type and mime_type.startswith('image/')

from PIL import Image
import io

def optimize_image(image_data):
    img = Image.open(io.BytesIO(image_data))

    # Calculate the new size while maintaining aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Convert to black and white
    img = img.convert("1")

    # Save as 1-bit GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()
    img = Image.open(io.BytesIO(image_data))

    # Convert to RGBA if it's not already
    if img.mode != 'RGBA':
        img = img.convert('RGBA')

    # Create a white background
    background = Image.new('RGBA', img.size, (255, 255, 255, 255))

    # Alpha composite the image onto the background
    img = Image.alpha_composite(background, img)

    # Convert back to RGB
    img = img.convert('RGB')

    # Calculate the new size while maintaining aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Convert to grayscale
    img = img.convert("L")

    # Apply Floyd-Steinberg dithering and convert to 1-bit black and white
    img = img.convert("1", dither=Image.FLOYDSTEINBERG)

    # Save as 1-bit GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()
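A small usage sketch for the reworked optimize_image() above: given raw image bytes it returns a Floyd-Steinberg-dithered, 1-bit GIF no larger than 512x342. The file names are hypothetical and the snippet is illustrative only, not part of the commit.

# Illustrative sketch, not part of this commit.
with open("sample.png", "rb") as f:        # hypothetical input file
    gif_bytes = optimize_image(f.read())   # resized to fit 512x342, dithered, returned as 1-bit GIF bytes
with open("sample.gif", "wb") as f:
    f.write(gif_bytes)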

def fetch_and_cache_image(url):
    try:
@@ -117,6 +141,13 @@ def replace_image_urls(content, base_url):
def serve_cached_image(filename):
    return send_from_directory(CACHE_DIR, filename, mimetype='image/gif')

def handle_image_request(url):
    cached_url = fetch_and_cache_image(url)
    if cached_url:
        return send_from_directory(CACHE_DIR, os.path.basename(cached_url), mimetype='image/gif')
    else:
        return abort(404, "Image not found or could not be processed")

@app.route("/", defaults={"path": "/"}, methods=["GET", "POST"])
@app.route("/<path:path>", methods=["GET", "POST"])
def handle_request(path):
@@ -180,6 +211,11 @@ def handle_matching_extension(matching_extension):

def process_response_with_image_caching(response, base_url):
    print(f"Processing response for URL: {base_url}")

    # Check if the response is for an image URL
    if is_image_url(base_url):
        return handle_image_request(base_url)

    if isinstance(response, tuple):
        if len(response) == 3:
            content, status_code, headers = response
@@ -201,20 +237,41 @@ def process_response_with_image_caching(response, base_url):
    content_type = headers.get('Content-Type', '').lower()
    print(f"Content-Type: {content_type}")

    # Apply image caching for HTML content
    if content_type.startswith('text/html'):
        print("Applying image caching to HTML content")
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')
        content = replace_image_urls(content, base_url)
        content = transcode_html(content, app.config["DISABLE_CHAR_CONVERSION"])
    elif content_type.startswith('text/'):
        print("Processing text content")
    # List of content types that should not be transcoded
    non_transcode_types = [
        'application/octet-stream',
        'application/pdf',
        'application/zip',
        'application/x-zip-compressed',
        'application/x-rar-compressed',
        'application/x-tar',
        'application/x-gzip',
        'application/x-bzip2',
        'application/x-7z-compressed',
        'application/vnd.openxmlformats-officedocument',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
        'application/msword',
        'audio/',
        'video/',
    ]

    # Check if content type is in the list of non-transcode types
    should_transcode = not any(content_type.startswith(t) for t in non_transcode_types)

    if should_transcode:
        print("Transcoding content")
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')

        # Apply image caching for HTML-like content
        if content_type.startswith('text/html') or '<html' in content.lower():
            print("Applying image caching to HTML content")
            content = replace_image_urls(content, base_url)

        content = transcode_html(content, app.config["DISABLE_CHAR_CONVERSION"])
    else:
        print(f"Content is not text ({content_type}), passing through unchanged")
        print(f"Content type {content_type} should not be transcoded, passing through unchanged")

    response = Response(content, status_code)
    for key, value in headers.items():
@@ -230,6 +287,9 @@ def handle_default_request():
print(f"Handling default request for URL: {url}")

try:
if is_image_url(url):
return handle_image_request(url)

resp = send_request(url, headers)
content = resp.content
status_code = resp.status_code
