porting image conversion out of wayback
hunterirving committed Sep 9, 2024
1 parent 8823bb7 commit 95f8be6
Showing 3 changed files with 131 additions and 150 deletions.
82 changes: 6 additions & 76 deletions extensions/reddit/reddit.py
@@ -10,86 +10,18 @@
import mimetypes

DOMAIN = "reddit.com"
CACHE_DIR = os.path.join(os.path.dirname(__file__), "cached_images")
image_counter = 0
MAX_WIDTH = 512
MAX_HEIGHT = 342

def clear_image_cache():
    global image_counter
    if os.path.exists(CACHE_DIR):
        shutil.rmtree(CACHE_DIR)
    os.makedirs(CACHE_DIR, exist_ok=True)
    image_counter = 0

# Call this function when the extension is loaded
clear_image_cache()

def optimize_image(image_data):
    img = Image.open(io.BytesIO(image_data))

    # Calculate the new size while maintaining aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Convert to black and white
    img = img.convert("1")

    # Save as 1-bit GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()

def fetch_and_cache_image(url):
    global image_counter
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Optimize the image
        optimized_image = optimize_image(response.content)

        # Increment the counter and use it for the filename
        image_counter += 1
        file_name = f"img_{image_counter:04d}.gif"
        file_path = os.path.join(CACHE_DIR, file_name)

        with open(file_path, 'wb') as f:
            f.write(optimized_image)

        return f"http://reddit.com/cached_image/{file_name}"
    except Exception as e:
        print(f"Error processing image: {str(e)}")
        return None

def handle_request(request):
    if request.method != 'GET':
        return Response("Only GET requests are supported", status=405)

    url = request.url
    if url.startswith("http://reddit.com/cached_image/"):
        file_name = url.split("/")[-1]
        file_path = os.path.join(CACHE_DIR, file_name)
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                return Response(f.read(), mimetype='image/gif')
        else:
            return Response("Image not found", status=404)

    if not url.startswith(('http://old.reddit.com', 'https://old.reddit.com')):
        url = url.replace("reddit.com", "old.reddit.com", 1)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    try:
        resp = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
        resp = requests.get(url, allow_redirects=True, timeout=10)
        resp.raise_for_status()
        return process_content(resp.content, url)
    except requests.RequestException as e:
@@ -201,12 +133,10 @@ def process_content(content, url):
            enclosing_a = img.find_parent('a')
            if enclosing_a and enclosing_a.has_attr('href'):
                img_src = enclosing_a['href']
                cached_url = fetch_and_cache_image(img_src)
                if cached_url:
                    new_img = new_soup.new_tag('img', src=cached_url, width="50", height="40")
                    d.append(new_img)
                    d.append(" ")  # Add space between images

                new_img = new_soup.new_tag('img', src=img_src, width="50", height="40")
                d.append(new_img)
                d.append(" ")  # Add space between images

        # Add post content if it exists
        usertext_body = thing.find('div', class_='usertext-body')
        if usertext_body:
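For context, a minimal sketch of the flow this diff appears to set up: the reddit extension now emits plain <img> tags that point at the original image URL, and the helpers added to proxy.py further down are what fetch and convert that image when the browser asks for it. The URL below is hypothetical and the snippet is illustrative only, not part of the commit.

# Illustrative sketch, not part of this commit; assumes the proxy.py helpers shown below are in scope.
url = "http://example.com/picture.jpg"    # hypothetical image URL from a rewritten <img> tag
if is_image_url(url):                     # mimetypes-based check added in proxy.py
    response = handle_image_request(url)  # fetches the image, dithers it to a 1-bit GIF, serves it from the cache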
87 changes: 39 additions & 48 deletions extensions/waybackmachine/waybackmachine.py
@@ -8,6 +8,7 @@
import re
import mimetypes
import os
from urllib.parse import urljoin

DOMAIN = "web.archive.org"
TARGET_DATE = "19960101"
@@ -72,40 +73,22 @@ def get_override_status():
    global override_active
    return override_active

def transform_url(url):
    # If the URL is relative and starts with '/web/', prepend the DOMAIN
    if url.startswith('/web/'):
        return f'http://{DOMAIN}{url}'
def transform_url(url, base_url):
    # If the URL is absolute, return it as is
    if url.startswith(('http://', 'https://')):
        return url

    # If the URL is relative (doesn't start with a scheme or '/'), leave it as is
    if not re.match(r'^[a-zA-Z]+://|^/', url):
        return url
    # If the URL starts with '/web/', it's a wayback machine URL
    if url.startswith('/web/'):
        return f'http://{DOMAIN}{url}'

    # Parse the URL
    parsed = urlparse(url)
    # If it's a relative URL
    if not url.startswith('/'):
        # Join it with the base URL
        return urljoin(base_url, url)

    # Regular expression to match Wayback Machine URL pattern
    wayback_pattern = r'^/web/(\d{14})/(.+)'

    # Case 1: URL starts with "/web/" followed by 14 digits
    if parsed.path.startswith('/web/'):
        match = re.match(wayback_pattern, parsed.path)
        if match:
            original_url = match.group(2)
            return convert_ftp_to_http(original_url)

    # Case 2: Full Wayback Machine URL
    if parsed.netloc == DOMAIN:
        match = re.match(wayback_pattern, parsed.path)
        if match:
            original_url = match.group(2)
            # Ensure the URL has a scheme
            if not re.match(r'^[a-zA-Z]+://', original_url):
                original_url = 'http://' + original_url
            return convert_ftp_to_http(original_url)

    # If it's not a Wayback Machine URL, still convert FTP to HTTP
    return convert_ftp_to_http(url)
    # For other cases (like URLs starting with '/'), join with the base URL
    return urljoin(base_url, url)

def convert_ftp_to_http(url):
    parsed = urlparse(url)
@@ -115,20 +98,29 @@ def convert_ftp_to_http(url):
        return urlunparse(new_parsed)
    return url
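To illustrate the new signature, a few expected results of transform_url(url, base_url) as defined above, assuming a hypothetical archived page at http://www.example.com/dir/page.html. This is illustrative only, not part of the commit.

# Illustrative sketch, not part of this commit.
base = "http://www.example.com/dir/page.html"                 # hypothetical base URL of the archived page
transform_url("http://other.com/a.gif", base)                 # absolute -> returned unchanged
transform_url("/web/19960101000000/http://other.com/", base)  # -> "http://web.archive.org/web/19960101000000/http://other.com/"
transform_url("images/logo.gif", base)                        # relative -> "http://www.example.com/dir/images/logo.gif"
transform_url("/pics/logo.gif", base)                         # root-relative -> "http://www.example.com/pics/logo.gif"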

def process_html_content(content):
    soup = BeautifulSoup(content, 'html.parser')

    # Process all links
    for a in soup.find_all('a', href=True):
        a['href'] = transform_url(a['href'])

    # Process all images, scripts, and other resources
    for tag in soup.find_all(['img', 'script', 'link'], src=True):
        tag['src'] = transform_url(tag['src'])
    for tag in soup.find_all('link', href=True):
        tag['href'] = transform_url(tag['href'])

    return str(soup)
def process_html_content(content, base_url):
    soup = BeautifulSoup(content, 'html.parser')

    # Process all links
    for a in soup.find_all('a', href=True):
        a['href'] = transform_url(a['href'], base_url)

    # Process all images, scripts, and other resources
    for tag in soup.find_all(['img', 'script', 'link', 'iframe'], src=True):
        tag['src'] = transform_url(tag['src'], base_url)
    for tag in soup.find_all('link', href=True):
        tag['href'] = transform_url(tag['href'], base_url)

    # Handle background images in style attributes
    for tag in soup.find_all(style=True):
        style = tag['style']
        urls = re.findall(r'url\([\'"]?([^\'" \)]+)', style)
        for url in urls:
            new_url = transform_url(url, base_url)
            style = style.replace(url, new_url)
        tag['style'] = style

    return str(soup)

# def extract_original_url(wayback_url):
# parsed = urlparse(wayback_url)
@@ -208,7 +200,6 @@ def handle_request(req):
        selected_year=selected_year,
        current_year=current_year,
        date_update_message=date_update_message), 200

    # If we're here, override is active and we're handling a non-wayback domain
    try:
        print('Handling request for:', req.url)
@@ -235,7 +226,7 @@ def handle_request(req):

        # Fetch the content of the archived page
        response = requests.get(snapshot.archive_url, headers={'User-Agent': user_agent})
        content = response.content  # Use content instead of text to handle binary data
        content = response.content
        print("Content fetched, length:", len(content))

        # Determine the content type
@@ -254,7 +245,7 @@ def handle_request(req):
        # Process HTML content
        if content_type.startswith('text/html'):
            content = content.decode('utf-8', errors='replace')
            processed_content = process_html_content(content)
            processed_content = process_html_content(content, url)
            return processed_content, response.status_code, {'Content-Type': 'text/html'}

        # For text-based content types, decode and return as string
112 changes: 86 additions & 26 deletions proxy.py
@@ -9,6 +9,7 @@
from PIL import Image
import hashlib
import shutil
import mimetypes

os.environ['FLASK_ENV'] = 'development'
app = Flask(__name__)
@@ -54,23 +55,46 @@ def clear_image_cache():
    extensions[ext] = module
    domain_to_extension[module.DOMAIN] = module

def is_image_url(url):
    mime_type, _ = mimetypes.guess_type(url)
    return mime_type and mime_type.startswith('image/')

from PIL import Image
import io

def optimize_image(image_data):
    img = Image.open(io.BytesIO(image_data))

    # Calculate the new size while maintaining aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Convert to black and white
    img = img.convert("1")

    # Save as 1-bit GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()
    img = Image.open(io.BytesIO(image_data))

    # Convert to RGBA if it's not already
    if img.mode != 'RGBA':
        img = img.convert('RGBA')

    # Create a white background
    background = Image.new('RGBA', img.size, (255, 255, 255, 255))

    # Alpha composite the image onto the background
    img = Image.alpha_composite(background, img)

    # Convert back to RGB
    img = img.convert('RGB')

    # Calculate the new size while maintaining aspect ratio
    width, height = img.size
    if width > MAX_WIDTH or height > MAX_HEIGHT:
        ratio = min(MAX_WIDTH / width, MAX_HEIGHT / height)
        new_size = (int(width * ratio), int(height * ratio))
        img = img.resize(new_size, Image.LANCZOS)

    # Convert to grayscale
    img = img.convert("L")

    # Apply Floyd-Steinberg dithering and convert to 1-bit black and white
    img = img.convert("1", dither=Image.FLOYDSTEINBERG)

    # Save as 1-bit GIF
    output = io.BytesIO()
    img.save(output, format="GIF", optimize=True)
    return output.getvalue()
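A small usage sketch for the reworked optimize_image() above: given raw image bytes it returns a Floyd-Steinberg-dithered, 1-bit GIF no larger than 512x342. The file names are hypothetical and the snippet is illustrative only, not part of the commit.

# Illustrative sketch, not part of this commit.
with open("sample.png", "rb") as f:        # hypothetical input file
    gif_bytes = optimize_image(f.read())   # resized to fit 512x342, dithered, returned as 1-bit GIF bytes
with open("sample.gif", "wb") as f:
    f.write(gif_bytes)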

def fetch_and_cache_image(url):
    try:
@@ -117,6 +141,13 @@ def replace_image_urls(content, base_url):
def serve_cached_image(filename):
    return send_from_directory(CACHE_DIR, filename, mimetype='image/gif')

def handle_image_request(url):
    cached_url = fetch_and_cache_image(url)
    if cached_url:
        return send_from_directory(CACHE_DIR, os.path.basename(cached_url), mimetype='image/gif')
    else:
        return abort(404, "Image not found or could not be processed")

@app.route("/", defaults={"path": "/"}, methods=["GET", "POST"])
@app.route("/<path:path>", methods=["GET", "POST"])
def handle_request(path):
@@ -180,6 +211,11 @@ def handle_matching_extension(matching_extension):

def process_response_with_image_caching(response, base_url):
    print(f"Processing response for URL: {base_url}")

    # Check if the response is for an image URL
    if is_image_url(base_url):
        return handle_image_request(base_url)

    if isinstance(response, tuple):
        if len(response) == 3:
            content, status_code, headers = response
@@ -201,20 +237,41 @@ def process_response_with_image_caching(response, base_url):
    content_type = headers.get('Content-Type', '').lower()
    print(f"Content-Type: {content_type}")

    # Apply image caching for HTML content
    if content_type.startswith('text/html'):
        print("Applying image caching to HTML content")
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')
        content = replace_image_urls(content, base_url)
        content = transcode_html(content, app.config["DISABLE_CHAR_CONVERSION"])
    elif content_type.startswith('text/'):
        print("Processing text content")
    # List of content types that should not be transcoded
    non_transcode_types = [
        'application/octet-stream',
        'application/pdf',
        'application/zip',
        'application/x-zip-compressed',
        'application/x-rar-compressed',
        'application/x-tar',
        'application/x-gzip',
        'application/x-bzip2',
        'application/x-7z-compressed',
        'application/vnd.openxmlformats-officedocument',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
        'application/msword',
        'audio/',
        'video/',
    ]

    # Check if content type is in the list of non-transcode types
    should_transcode = not any(content_type.startswith(t) for t in non_transcode_types)

    if should_transcode:
        print("Transcoding content")
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')

        # Apply image caching for HTML-like content
        if content_type.startswith('text/html') or '<html' in content.lower():
            print("Applying image caching to HTML content")
            content = replace_image_urls(content, base_url)

        content = transcode_html(content, app.config["DISABLE_CHAR_CONVERSION"])
    else:
        print(f"Content is not text ({content_type}), passing through unchanged")
        print(f"Content type {content_type} should not be transcoded, passing through unchanged")

    response = Response(content, status_code)
    for key, value in headers.items():
@@ -230,6 +287,9 @@ def handle_default_request():
print(f"Handling default request for URL: {url}")

try:
if is_image_url(url):
return handle_image_request(url)

resp = send_request(url, headers)
content = resp.content
status_code = resp.status_code
