From 7a95f685fddd6e4be25418c9574adb369d0b6c37 Mon Sep 17 00:00:00 2001 From: Hunter Date: Mon, 9 Sep 2024 14:22:22 -0400 Subject: [PATCH] implement more robust image converting/caching --- extensions/hunterirving/hunterirving.py | 9 +- extensions/reddit/reddit.py | 8 +- extensions/waybackmachine/waybackmachine.py | 82 ++++++++---- extensions/wiby/wiby.py | 21 ++-- image_utils.py | 65 ++++++++++ proxy.py | 133 ++++---------------- 6 files changed, 167 insertions(+), 151 deletions(-) create mode 100644 image_utils.py diff --git a/extensions/hunterirving/hunterirving.py b/extensions/hunterirving/hunterirving.py index f70cb1b..89b4ba6 100644 --- a/extensions/hunterirving/hunterirving.py +++ b/extensions/hunterirving/hunterirving.py @@ -2,6 +2,7 @@ import requests from bs4 import BeautifulSoup from datetime import datetime, timedelta +import mimetypes DOMAIN = "hunterirving.com" @@ -33,7 +34,13 @@ def handle_request(req): response = requests.get(url) response.raise_for_status() # Raise an exception for bad status codes - # Try to decode with UTF-8 first, then fall back to ISO-8859-1 + # Check if the content is an image + content_type = response.headers.get('Content-Type', '') + if content_type.startswith('image/'): + # For images, return the content as-is + return response.content, response.status_code, {'Content-Type': content_type} + + # For non-image content, proceed with HTML processing try: html_content = response.content.decode('utf-8') except UnicodeDecodeError: diff --git a/extensions/reddit/reddit.py b/extensions/reddit/reddit.py index c9169c6..7889ae8 100644 --- a/extensions/reddit/reddit.py +++ b/extensions/reddit/reddit.py @@ -10,6 +10,11 @@ import mimetypes DOMAIN = "reddit.com" +USER_AGENT = None + +def set_user_agent(user_agent): + global USER_AGENT + USER_AGENT = user_agent def handle_request(request): if request.method != 'GET': @@ -21,7 +26,8 @@ def handle_request(request): url = url.replace("reddit.com", "old.reddit.com", 1) try: - resp = requests.get(url, allow_redirects=True, timeout=10) + headers = {'User-Agent': USER_AGENT} if USER_AGENT else {} + resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10) resp.raise_for_status() return process_content(resp.content, url) except requests.RequestException as e: diff --git a/extensions/waybackmachine/waybackmachine.py b/extensions/waybackmachine/waybackmachine.py index d7267f1..31bedbb 100644 --- a/extensions/waybackmachine/waybackmachine.py +++ b/extensions/waybackmachine/waybackmachine.py @@ -1,5 +1,5 @@ from flask import request, render_template_string -from urllib.parse import urlparse, urlunparse +from urllib.parse import urlparse, urlunparse, urljoin from waybackpy import WaybackMachineCDXServerAPI import requests from bs4 import BeautifulSoup @@ -8,7 +8,6 @@ import re import mimetypes import os -from urllib.parse import urljoin DOMAIN = "web.archive.org" TARGET_DATE = "19960101" @@ -73,22 +72,22 @@ def get_override_status(): global override_active return override_active -def transform_url(url, base_url): - # If the URL is absolute, return it as is - if url.startswith(('http://', 'https://')): - return url +# def transform_url(url, base_url): +# # If the URL is absolute, return it as is +# if url.startswith(('http://', 'https://')): +# return url - # If the URL starts with '/web/', it's a wayback machine URL - if url.startswith('/web/'): - return f'http://{DOMAIN}{url}' +# # If the URL starts with '/web/', it's a wayback machine URL +# if url.startswith('/web/'): +# return f'http://{DOMAIN}{url}' - # If 
it's a relative URL
-	if not url.startswith('/'):
-		# Join it with the base URL
-		return urljoin(base_url, url)
+# 	# If it's a relative URL
+# 	if not url.startswith('/'):
+# 		# Join it with the base URL
+# 		return urljoin(base_url, url)
 
-	# For other cases (like URLs starting with '/'), join with the base URL
-	return urljoin(base_url, url)
+# 	# For other cases (like URLs starting with '/'), join with the base URL
+# 	return urljoin(base_url, url)
 
 def convert_ftp_to_http(url):
 	parsed = urlparse(url)
@@ -103,32 +102,59 @@ def process_html_content(content, base_url):
 
 	# Process all links
 	for a in soup.find_all('a', href=True):
-		a['href'] = transform_url(a['href'], base_url)
+		a['href'] = extract_original_url(a['href'], base_url)
 
 	# Process all images, scripts, and other resources
 	for tag in soup.find_all(['img', 'script', 'link', 'iframe'], src=True):
-		tag['src'] = transform_url(tag['src'], base_url)
+		tag['src'] = extract_original_url(tag['src'], base_url)
 
 	for tag in soup.find_all('link', href=True):
-		tag['href'] = transform_url(tag['href'], base_url)
+		tag['href'] = extract_original_url(tag['href'], base_url)
 
 	# Handle background images in style attributes
 	for tag in soup.find_all(style=True):
 		style = tag['style']
 		urls = re.findall(r'url\([\'"]?([^\'" \)]+)', style)
 		for url in urls:
-			new_url = transform_url(url, base_url)
+			new_url = extract_original_url(url, base_url)
 			style = style.replace(url, new_url)
 		tag['style'] = style
 
 	return str(soup)
 
-# def extract_original_url(wayback_url):
-# 	parsed = urlparse(wayback_url)
-# 	if parsed.netloc == DOMAIN and '/web/' in parsed.path:
-# 		path_parts = parsed.path.split('/', 3)
-# 		if len(path_parts) >= 4:
-# 			return 'http://' + path_parts[3]
-# 	return wayback_url
+def extract_original_url(url, base_url):
+	# Parse the base_url to extract the original domain and path
+	parsed_base = urlparse(base_url)
+	original_domain = parsed_base.netloc.split(':', 1)[0] # Remove port if present
+	original_path = ''
+
+	if original_domain == DOMAIN:
+		# Extract the original URL from the Wayback Machine URL
+		parts = parsed_base.path.split('/', 4)
+		if len(parts) >= 5:
+			original_domain = parts[3]
+			original_path = '/'.join(parts[4:]).rsplit('/', 1)[0]
+	else:
+		# If it's not a Wayback Machine URL, use the path from parsed_base
+		original_path = '/'.join(parsed_base.path.split('/')[:-1]) # Remove the file name from the path
+
+	# Case 1: If the URL is already absolute and not a Wayback Machine URL, return it as is
+	if url.startswith(('http://', 'https://')) and DOMAIN not in url:
+		return url
+
+	# Case 2: If it's a Wayback Machine URL, extract the original URL (keep its scheme if it already has one)
+	if url.startswith(('/web/', f'http://{DOMAIN}/web/', f'https://{DOMAIN}/web/')):
+		parts = url.split('/', 5)
+		if len(parts) >= 6:
+			return parts[5] if parts[5].startswith(('http://', 'https://')) else f'http://{parts[5]}'
+		return url
+
+	# Case 3: If it's a root-relative URL (starts with '/')
+	if url.startswith('/') and not url.startswith('/web/'):
+		return f'http://{original_domain}{url}'
+
+	# Case 4: For relative URLs, join with the original domain and path
+	full_base = f'http://{original_domain}{original_path}/'
+	return urljoin(full_base, url)
 
 def get_mime_type(url):
 	# Get the file extension
@@ -200,6 +226,7 @@ def handle_request(req):
 							selected_year=selected_year,
 							current_year=current_year,
 							date_update_message=date_update_message), 200
 
+	# If we're here, override is active and we're handling a non-wayback domain
 	try:
 		print('Handling request for:', req.url)
@@ -232,6 +259,9 @@
 	# Determine the content type
 	content_type = 
response.headers.get('Content-Type', '').split(';')[0].strip() + if content_type.startswith('image/'): + return content, response.status_code, {'Content-Type': content_type} + if not content_type: # If no content type is provided, guess based on the URL content_type, _ = mimetypes.guess_type(url) diff --git a/extensions/wiby/wiby.py b/extensions/wiby/wiby.py index 5846c50..5be0658 100644 --- a/extensions/wiby/wiby.py +++ b/extensions/wiby/wiby.py @@ -10,14 +10,8 @@ def handle_request(request): return handle_surprise(request) else: url = request.url.replace("https://", "http://", 1) - headers = { - "Accept": request.headers.get("Accept"), - "Accept-Language": request.headers.get("Accept-Language"), - "Referer": request.headers.get("Referer"), - "User-Agent": request.headers.get("User-Agent"), - } - resp = requests.get(url, headers=headers) + resp = requests.get(url) # If it's the homepage, modify the page structure if url == "http://wiby.me" or url == "http://wiby.me/": @@ -42,7 +36,9 @@ def get_final_surprise_url(): if resp.status_code in (301, 302, 303, 307, 308): url = urljoin(url, resp.headers['Location']) redirects += 1 - elif resp.status_code == 200: + continue + + if resp.status_code == 200: soup = BeautifulSoup(resp.content, 'html.parser') meta_tag = soup.find("meta", attrs={"http-equiv": "refresh"}) @@ -52,12 +48,9 @@ def get_final_surprise_url(): if len(parts) > 1: url = urljoin(url, parts[1].strip("'\"")) redirects += 1 - else: - return url - else: - return url - else: - return url + continue + + return url return url diff --git a/image_utils.py b/image_utils.py new file mode 100644 index 0000000..92b7521 --- /dev/null +++ b/image_utils.py @@ -0,0 +1,65 @@ +import os +import io +import hashlib +import requests +from PIL import Image +import mimetypes + +CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images") +MAX_WIDTH = 512 +MAX_HEIGHT = 342 +USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36" + +def is_image_url(url): + mime_type, _ = mimetypes.guess_type(url) + return mime_type and mime_type.startswith('image/') + +def optimize_image(image_data): + img = Image.open(io.BytesIO(image_data)) + + if img.mode != 'RGBA': + img = img.convert('RGBA') + + background = Image.new('RGBA', img.size, (255, 255, 255, 255)) + img = Image.alpha_composite(background, img) + img = img.convert('RGB') + + width, height = img.size + if width > MAX_WIDTH or height > MAX_HEIGHT: + ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height) + new_size = (int(width * ratio), int(height * ratio)) + img = img.resize(new_size, Image.LANCZOS) + + img = img.convert("L") + img = img.convert("1", dither=Image.FLOYDSTEINBERG) + + output = io.BytesIO() + img.save(output, format="GIF", optimize=True) + return output.getvalue() + +def fetch_and_cache_image(url, content=None): + try: + print(f"Processing image: {url}") + + file_name = hashlib.md5(url.encode()).hexdigest() + ".gif" + file_path = os.path.join(CACHE_DIR, file_name) + + if not os.path.exists(file_path): + print(f"Optimizing and caching image: {url}") + if content is None: + response = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + response.raise_for_status() + content = response.content + + optimized_image = optimize_image(content) + with open(file_path, 'wb') as f: + f.write(optimized_image) + else: + print(f"Image already cached: {url}") + + cached_url = f"/cached_image/{file_name}" + print(f"Cached URL: {cached_url}") + return cached_url 
+	except Exception as e:
+		print(f"Error processing image: {url}, Error: {str(e)}")
+		return None
\ No newline at end of file
diff --git a/proxy.py b/proxy.py
index 9d213d7..ce2c37a 100644
--- a/proxy.py
+++ b/proxy.py
@@ -10,6 +10,7 @@
 import hashlib
 import shutil
 import mimetypes
+from image_utils import is_image_url, fetch_and_cache_image, CACHE_DIR
 
 os.environ['FLASK_ENV'] = 'development'
 app = Flask(__name__)
@@ -21,11 +22,6 @@
 # Global variable to store the override extension
 override_extension = None
 
-# Global variables for image caching
-CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
-MAX_WIDTH = 512
-MAX_HEIGHT = 342
-
 # User-Agent string
 USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
 
@@ -55,88 +51,6 @@ def clear_image_cache():
 		extensions[ext] = module
 		domain_to_extension[module.DOMAIN] = module
 
-def is_image_url(url):
-	mime_type, _ = mimetypes.guess_type(url)
-	return mime_type and mime_type.startswith('image/')
-
-from PIL import Image
-import io
-
-def optimize_image(image_data):
-	img = Image.open(io.BytesIO(image_data))
-
-	# Convert to RGBA if it's not already
-	if img.mode != 'RGBA':
-		img = img.convert('RGBA')
-
-	# Create a white background
-	background = Image.new('RGBA', img.size, (255, 255, 255, 255))
-
-	# Alpha composite the image onto the background
-	img = Image.alpha_composite(background, img)
-
-	# Convert back to RGB
-	img = img.convert('RGB')
-
-	# Calculate the new size while maintaining aspect ratio
-	width, height = img.size
-	if width > MAX_WIDTH or height > MAX_HEIGHT:
-		ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
-		new_size = (int(width * ratio), int(height * ratio))
-		img = img.resize(new_size, Image.LANCZOS)
-
-	# Convert to grayscale
-	img = img.convert("L")
-
-	# Apply Floyd-Steinberg dithering and convert to 1-bit black and white
-	img = img.convert("1", dither=Image.FLOYDSTEINBERG)
-
-	# Save as 1-bit GIF
-	output = io.BytesIO()
-	img.save(output, format="GIF", optimize=True)
-	return output.getvalue()
-
-def fetch_and_cache_image(url):
-	try:
-		print(f"Fetching image: {url}")
-		response = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
-		response.raise_for_status()
-
-		# Generate a unique filename based on the URL
-		file_name = hashlib.md5(url.encode()).hexdigest() + ".gif"
-		file_path = os.path.join(CACHE_DIR, file_name)
-
-		# If the file doesn't exist, optimize and cache it
-		if not os.path.exists(file_path):
-			print(f"Optimizing and caching image: {url}")
-			optimized_image = optimize_image(response.content)
-			with open(file_path, 'wb') as f:
-				f.write(optimized_image)
-		else:
-			print(f"Image already cached: {url}")
-
-		cached_url = f"/cached_image/{file_name}"
-		print(f"Cached URL: {cached_url}")
-		return cached_url
-	except Exception as e:
-		print(f"Error processing image: {url}, Error: {str(e)}")
-		return None
-
-def replace_image_urls(content, base_url):
-	soup = BeautifulSoup(content, 'html.parser')
-	for img in soup.find_all('img'):
-		src = img.get('src')
-		if src:
-			full_url = urljoin(base_url, src)
-			print(f"Processing image: {full_url}")
-			cached_url = fetch_and_cache_image(full_url)
-			if cached_url:
-				img['src'] = cached_url
-				print(f"Replaced image URL: {src} -> {cached_url}")
-			else:
-				print(f"Failed to cache image: {full_url}")
-	return str(soup)
-
 @app.route("/cached_image/<filename>")
 def serve_cached_image(filename):
 	return send_from_directory(CACHE_DIR, filename, mimetype='image/gif')
@@ -161,11 +75,16 
@@ def handle_request(path): override_response = handle_override_extension(scheme) if override_response is not None: - return process_response_with_image_caching(override_response, request.url) + return process_response(override_response, request.url) matching_extension = find_matching_extension(host) if matching_extension: - return handle_matching_extension(matching_extension) + response = handle_matching_extension(matching_extension) + return process_response(response, request.url) + + # Only handle image requests here if we're not using an extension + if is_image_url(request.url) and not (override_extension or matching_extension): + return handle_image_request(request.url) return handle_default_request() @@ -177,7 +96,7 @@ def handle_override_extension(scheme): if scheme in ['http', 'https', 'ftp']: response = extensions[extension_name].handle_request(request) check_override_status(extension_name) - return process_response_with_image_caching(response, request.url) + return process_response(response, request.url) else: print(f"Warning: Unsupported scheme '{scheme}' for override extension.") else: @@ -206,16 +125,12 @@ def handle_matching_extension(matching_extension): override_extension = matching_extension.__name__ print(f"Override enabled for {override_extension}") - # Use the original request URL as the base URL - return process_response_with_image_caching(response, request.url) + # Return the response directly + return response + +def process_response(response, url): + print(f"Processing response for URL: {url}") -def process_response_with_image_caching(response, base_url): - print(f"Processing response for URL: {base_url}") - - # Check if the response is for an image URL - if is_image_url(base_url): - return handle_image_request(base_url) - if isinstance(response, tuple): if len(response) == 3: content, status_code, headers = response @@ -227,7 +142,6 @@ def process_response_with_image_caching(response, base_url): status_code = 200 headers = {} elif isinstance(response, Response): - print("Response is already a Flask Response object") return response else: content = response @@ -237,6 +151,14 @@ def process_response_with_image_caching(response, base_url): content_type = headers.get('Content-Type', '').lower() print(f"Content-Type: {content_type}") + if content_type.startswith('image/'): + # For image content, use the fetch_and_cache_image function + cached_url = fetch_and_cache_image(url, content) + if cached_url: + return send_from_directory(CACHE_DIR, os.path.basename(cached_url), mimetype='image/gif') + else: + return abort(404, "Image could not be processed") + # List of content types that should not be transcoded non_transcode_types = [ 'application/octet-stream', @@ -264,11 +186,6 @@ def process_response_with_image_caching(response, base_url): if isinstance(content, bytes): content = content.decode('utf-8', errors='replace') - # Apply image caching for HTML-like content - if content_type.startswith('text/html') or '
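
A note on the extract_original_url() rewrite in waybackmachine.py: Case 2 relies on the shape of Wayback Machine paths, /web/<timestamp>/<original-url>, where splitting on the first five slashes leaves the original URL in the final field. A minimal standalone sketch of just that step (strip_wayback_prefix is a hypothetical name and the example URLs are placeholders):

DOMAIN = "web.archive.org"

def strip_wayback_prefix(url):
	# parts[5] holds everything past /web/<timestamp>/, i.e. the original
	# URL, with its scheme intact for absolute archive links
	parts = url.split('/', 5)
	if len(parts) >= 6:
		original = parts[5]
		return original if original.startswith(('http://', 'https://')) else f'http://{original}'
	return url

print(strip_wayback_prefix('/web/19960101000000/http://example.com/index.html'))
print(strip_wayback_prefix(f'http://{DOMAIN}/web/19960101000000/http://example.com/index.html'))
# both print: http://example.com/index.html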
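The conversion pipeline now centralized in image_utils.optimize_image() does the heavy lifting: flatten transparency onto a white background, downscale proportionally into 512x342 (the classic Macintosh screen), grayscale, then Floyd-Steinberg dither down to 1-bit and emit a GIF. It can be exercised standalone; a sketch assuming Pillow is installed and a Pillow-readable test.png sits next to the script (to_one_bit_gif is a hypothetical name):

import io
from PIL import Image

MAX_WIDTH, MAX_HEIGHT = 512, 342  # classic Macintosh display size

def to_one_bit_gif(image_data):
	img = Image.open(io.BytesIO(image_data))

	# Flatten any transparency onto a white background
	if img.mode != 'RGBA':
		img = img.convert('RGBA')
	background = Image.new('RGBA', img.size, (255, 255, 255, 255))
	img = Image.alpha_composite(background, img).convert('RGB')

	# Downscale proportionally so the image fits the target display
	width, height = img.size
	if width > MAX_WIDTH or height > MAX_HEIGHT:
		ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
		img = img.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)

	# Grayscale, then Floyd-Steinberg dither to 1-bit black and white
	img = img.convert("L").convert("1", dither=Image.FLOYDSTEINBERG)

	output = io.BytesIO()
	img.save(output, format="GIF", optimize=True)
	return output.getvalue()

with open("test.png", "rb") as f:
	gif_bytes = to_one_bit_gif(f.read())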
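Caching in fetch_and_cache_image() is keyed on the MD5 hex digest of the source URL, so each image is fetched and dithered at most once and is thereafter served from disk by the /cached_image/<filename> route in proxy.py. A sketch of the key derivation alongside a matching route (the standalone Flask app and the cache_path helper are illustrative, not part of the patch):

import hashlib
import os
from flask import Flask, send_from_directory

CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
app = Flask(__name__)

def cache_path(url):
	# Deterministic key: the same source URL always maps to the same file,
	# so os.path.exists() doubles as the cache-hit test
	file_name = hashlib.md5(url.encode()).hexdigest() + ".gif"
	return os.path.join(CACHE_DIR, file_name)

@app.route("/cached_image/<filename>")
def serve_cached_image(filename):
	# Every cache entry is a 1-bit GIF, so the MIME type is fixed
	return send_from_directory(CACHE_DIR, filename, mimetype='image/gif')

Keying on the URL rather than the image bytes means a changed image at an unchanged URL stays stale until the cache is cleared, which is presumably what clear_image_cache() in proxy.py is for.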