implement more robust image converting/caching
hunterirving committed Sep 9, 2024
1 parent 95f8be6 commit 7a95f68
Showing 6 changed files with 167 additions and 151 deletions.
9 changes: 8 additions & 1 deletion extensions/hunterirving/hunterirving.py
@@ -2,6 +2,7 @@
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import mimetypes

DOMAIN = "hunterirving.com"

@@ -33,7 +34,13 @@ def handle_request(req):
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad status codes

# Try to decode with UTF-8 first, then fall back to ISO-8859-1
# Check if the content is an image
content_type = response.headers.get('Content-Type', '')
if content_type.startswith('image/'):
# For images, return the content as-is
return response.content, response.status_code, {'Content-Type': content_type}

# For non-image content, proceed with HTML processing
try:
html_content = response.content.decode('utf-8')
except UnicodeDecodeError:
8 changes: 7 additions & 1 deletion extensions/reddit/reddit.py
@@ -10,6 +10,11 @@
import mimetypes

DOMAIN = "reddit.com"
USER_AGENT = None

def set_user_agent(user_agent):
global USER_AGENT
USER_AGENT = user_agent

def handle_request(request):
if request.method != 'GET':
@@ -21,7 +26,8 @@ def handle_request(request):
url = url.replace("reddit.com", "old.reddit.com", 1)

try:
resp = requests.get(url, allow_redirects=True, timeout=10)
headers = {'User-Agent': USER_AGENT} if USER_AGENT else {}
resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
resp.raise_for_status()
return process_content(resp.content, url)
except requests.RequestException as e:
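
For context, a minimal sketch of how the new set_user_agent() hook might be wired up by the proxy at startup. The call site is not part of this diff, and the import path and User-Agent string below are assumptions.

# Hypothetical startup wiring (assumes extensions/ is importable as a package):
from extensions.reddit import reddit

# Hand the extension a browser-like User-Agent before any requests reach
# handle_request(); if set_user_agent() is never called, requests go out
# with empty headers, as in the diff above.
reddit.set_user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36")
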
82 changes: 56 additions & 26 deletions extensions/waybackmachine/waybackmachine.py
@@ -1,5 +1,5 @@
from flask import request, render_template_string
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlparse, urlunparse, urljoin
from waybackpy import WaybackMachineCDXServerAPI
import requests
from bs4 import BeautifulSoup
@@ -8,7 +8,6 @@
import re
import mimetypes
import os
from urllib.parse import urljoin

DOMAIN = "web.archive.org"
TARGET_DATE = "19960101"
@@ -73,22 +72,22 @@ def get_override_status():
global override_active
return override_active

def transform_url(url, base_url):
# If the URL is absolute, return it as is
if url.startswith(('http://', 'https://')):
return url
# def transform_url(url, base_url):
# # If the URL is absolute, return it as is
# if url.startswith(('http://', 'https://')):
# return url

# If the URL starts with '/web/', it's a wayback machine URL
if url.startswith('/web/'):
return f'http://{DOMAIN}{url}'
# # If the URL starts with '/web/', it's a wayback machine URL
# if url.startswith('/web/'):
# return f'http://{DOMAIN}{url}'

# If it's a relative URL
if not url.startswith('/'):
# Join it with the base URL
return urljoin(base_url, url)
# # If it's a relative URL
# if not url.startswith('/'):
# # Join it with the base URL
# return urljoin(base_url, url)

# For other cases (like URLs starting with '/'), join with the base URL
return urljoin(base_url, url)
# # For other cases (like URLs starting with '/'), join with the base URL
# return urljoin(base_url, url)

def convert_ftp_to_http(url):
parsed = urlparse(url)
@@ -103,32 +102,59 @@ def process_html_content(content, base_url):

# Process all links
for a in soup.find_all('a', href=True):
a['href'] = transform_url(a['href'], base_url)
a['href'] = extract_original_url(a['href'], base_url)

# Process all images, scripts, and other resources
for tag in soup.find_all(['img', 'script', 'link', 'iframe'], src=True):
tag['src'] = transform_url(tag['src'], base_url)
tag['src'] = extract_original_url(tag['src'], base_url)
for tag in soup.find_all('link', href=True):
tag['href'] = transform_url(tag['href'], base_url)
tag['href'] = extract_original_url(tag['href'], base_url)

# Handle background images in style attributes
for tag in soup.find_all(style=True):
style = tag['style']
urls = re.findall(r'url\([\'"]?([^\'" \)]+)', style)
for url in urls:
new_url = transform_url(url, base_url)
new_url = extract_original_url(url, base_url)
style = style.replace(url, new_url)
tag['style'] = style

return str(soup)

# def extract_original_url(wayback_url):
# parsed = urlparse(wayback_url)
# if parsed.netloc == DOMAIN and '/web/' in parsed.path:
# path_parts = parsed.path.split('/', 3)
# if len(path_parts) >= 4:
# return 'http://' + path_parts[3]
# return wayback_url
def extract_original_url(url, base_url):
# Parse the base_url to extract the original domain and path
parsed_base = urlparse(base_url)
original_domain = parsed_base.netloc.split(':', 1)[0] # Remove port if present
original_path = ''

if original_domain == DOMAIN:
# Extract the original URL from the Wayback Machine URL
parts = parsed_base.path.split('/', 4)
if len(parts) >= 5:
original_domain = parts[3]
original_path = '/'.join(parts[4:]).rsplit('/', 1)[0]
else:
# If it's not a Wayback Machine URL, use the path from parsed_base
original_path = '/'.join(parsed_base.path.split('/')[:-1]) # Remove the file name from the path

# Case 1: If the URL is already absolute and not a Wayback Machine URL, return it as is
if url.startswith(('http://', 'https://')) and DOMAIN not in url:
return url

# Case 2: If it's a Wayback Machine URL, extract the original URL
if url.startswith(('/web/', f'http://{DOMAIN}/web/', f'https://{DOMAIN}/web/')):
parts = url.split('/', 5)
if len(parts) >= 6:
return f'http://{parts[5]}'
return url

# Case 3: If it's a root-relative URL (starts with '/')
if url.startswith('/') and not url.startswith('/web/'):
return f'http://{original_domain}{url}'

# Case 4: For relative URLs, join with the original domain and path
full_base = f'http://{original_domain}{original_path}/'
return urljoin(full_base, url)

def get_mime_type(url):
# Get the file extension
@@ -200,6 +226,7 @@ def handle_request(req):
selected_year=selected_year,
current_year=current_year,
date_update_message=date_update_message), 200

# If we're here, override is active and we're handling a non-wayback domain
try:
print('Handling request for:', req.url)
@@ -232,6 +259,9 @@
# Determine the content type
content_type = response.headers.get('Content-Type', '').split(';')[0].strip()

if content_type.startswith('image/'):
return content, response.status_code, {'Content-Type': content_type}

if not content_type:
# If no content type is provided, guess based on the URL
content_type, _ = mimetypes.guess_type(url)
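
For reference, two illustrative calls to the new extract_original_url() helper (not part of the diff; the example URLs are made up):

# Assumes extensions/ is importable as a package.
from extensions.waybackmachine.waybackmachine import extract_original_url

base = "http://web.archive.org/web/19961220154510/http://www.example.com/index.html"

# Archived '/web/...' links are unwrapped back to the original host:
extract_original_url("/web/19961220154510/http://www.example.com/pics/logo.gif", base)
# -> "http://www.example.com/pics/logo.gif"

# Absolute URLs that do not point at web.archive.org pass through unchanged:
extract_original_url("http://www.example.org/other.html", base)
# -> "http://www.example.org/other.html"
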
21 changes: 7 additions & 14 deletions extensions/wiby/wiby.py
@@ -10,14 +10,8 @@ def handle_request(request):
return handle_surprise(request)
else:
url = request.url.replace("https://", "http://", 1)
headers = {
"Accept": request.headers.get("Accept"),
"Accept-Language": request.headers.get("Accept-Language"),
"Referer": request.headers.get("Referer"),
"User-Agent": request.headers.get("User-Agent"),
}

resp = requests.get(url, headers=headers)
resp = requests.get(url)

# If it's the homepage, modify the page structure
if url == "http://wiby.me" or url == "http://wiby.me/":
@@ -42,7 +36,9 @@ def get_final_surprise_url():
if resp.status_code in (301, 302, 303, 307, 308):
url = urljoin(url, resp.headers['Location'])
redirects += 1
elif resp.status_code == 200:
continue

if resp.status_code == 200:
soup = BeautifulSoup(resp.content, 'html.parser')
meta_tag = soup.find("meta", attrs={"http-equiv": "refresh"})

@@ -52,12 +48,9 @@
if len(parts) > 1:
url = urljoin(url, parts[1].strip("'\""))
redirects += 1
else:
return url
else:
return url
else:
return url
continue

return url

return url

65 changes: 65 additions & 0 deletions image_utils.py
@@ -0,0 +1,65 @@
import os
import io
import hashlib
import requests
from PIL import Image
import mimetypes

CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
MAX_WIDTH = 512
MAX_HEIGHT = 342
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"

def is_image_url(url):
    mime_type, _ = mimetypes.guess_type(url)
    return mime_type and mime_type.startswith('image/')

def optimize_image(image_data):
    img = Image.open(io.BytesIO(image_data))

    # Flatten any transparency onto a white background
    if img.mode != 'RGBA':
        img = img.convert('RGBA')

    background = Image.new('RGBA', img.size, (255, 255, 255, 255))
    img = Image.alpha_composite(background, img)
    img = img.convert('RGB')

    # Scale down to fit within MAX_WIDTH x MAX_HEIGHT, preserving aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Reduce to 1-bit black and white with Floyd-Steinberg dithering
    img = img.convert("L")
    img = img.convert("1", dither=Image.FLOYDSTEINBERG)

    # Save as an optimized GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()

def fetch_and_cache_image(url, content=None):
    try:
        print(f"Processing image: {url}")

        # Cache key is the MD5 of the source URL; every cached file is a GIF
        file_name = hashlib.md5(url.encode()).hexdigest() + ".gif"
        file_path = os.path.join(CACHE_DIR, file_name)

        if not os.path.exists(file_path):
            print(f"Optimizing and caching image: {url}")
            # Callers may pass already-downloaded bytes; otherwise fetch the URL
            if content is None:
                response = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
                response.raise_for_status()
                content = response.content

            optimized_image = optimize_image(content)
            with open(file_path, 'wb') as f:
                f.write(optimized_image)
        else:
            print(f"Image already cached: {url}")

        cached_url = f"/cached_image/{file_name}"
        print(f"Cached URL: {cached_url}")
        return cached_url
    except Exception as e:
        print(f"Error processing image: {url}, Error: {str(e)}")
        return None
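
A minimal usage sketch for the new module (illustrative only, not part of the commit). It assumes the caller creates the cache directory, since the module as shown does not, and that the proxy serves files under the /cached_image/ path.

import os

from image_utils import CACHE_DIR, fetch_and_cache_image, is_image_url

os.makedirs(CACHE_DIR, exist_ok=True)

url = "http://example.com/photo.jpg"  # hypothetical image URL
if is_image_url(url):
    cached_path = fetch_and_cache_image(url)
    if cached_path:
        # cached_path is a site-relative path like /cached_image/<md5>.gif,
        # pointing at the dithered 1-bit GIF written to CACHE_DIR
        print(f"Serve {cached_path}")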