Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/pypi-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,12 @@ jobs:
- name: Test that the basic pip built package runs without error
run: |
set -ex
pip3 install dist/changedetection.io*.whl
ls -alR

# Find and install the first .whl file
find dist -type f -name "*.whl" -exec pip3 install {} \; -quit
changedetection.io -d /tmp -p 10000 &

sleep 3
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null
curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null
Expand Down
66 changes: 31 additions & 35 deletions changedetectionio/blueprint/browser_steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@

browsersteps_sessions = {}
io_interface_context = None

import json
import base64
import hashlib
from flask import Response

def construct_blueprint(datastore: ChangeDetectionStore):
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
Expand Down Expand Up @@ -160,14 +163,13 @@ def browsersteps_ui_update():
if not browsersteps_sessions.get(browsersteps_session_id):
return make_response('No session exists under that ID', 500)


is_last_step = False
# Actions - step/apply/etc, do the thing and return state
if request.method == 'POST':
# @todo - should always be an existing session
step_operation = request.form.get('operation')
step_selector = request.form.get('selector')
step_optional_value = request.form.get('optional_value')
step_n = int(request.form.get('step_n'))
is_last_step = strtobool(request.form.get('is_last_step'))

# @todo try.. accept.. nice errors not popups..
Expand All @@ -182,48 +184,42 @@ def browsersteps_ui_update():
# Try to find something of value to give back to the user
return make_response(str(e).splitlines()[0], 401)

# Get visual selector ready/update its data (also use the current filter info from the page?)
# When the last 'apply' button was pressed
# @todo this adds overhead because the xpath selection is happening twice
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
if is_last_step and u:
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data()
watch = datastore.data['watching'].get(uuid)
if watch:
watch.save_screenshot(screenshot=screenshot)
watch.save_xpath_data(data=xpath_data)

# if not this_session.page:
# cleanup_playwright_session()
# return make_response('Browser session ran out of time :( Please reload this page.', 401)

# Screenshots and other info only needed on requesting a step (POST)
try:
state = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
if is_last_step:
watch = datastore.data['watching'].get(uuid)
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
if watch and u:
watch.save_screenshot(screenshot=screenshot)
watch.save_xpath_data(data=xpath_data)

except playwright._impl._api_types.Error as e:
return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
except Exception as e:
return make_response("Error fetching screenshot and element data - " + str(e), 401)

# SEND THIS BACK TO THE BROWSER

output = {
"screenshot": f"data:image/jpeg;base64,{base64.b64encode(screenshot).decode('ascii')}",
"xpath_data": xpath_data,
"session_age_start": browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start,
"browser_time_remaining": round(remaining)
}
json_data = json.dumps(output)

# Generate an ETag (hash of the response body)
etag_hash = hashlib.md5(json_data.encode('utf-8')).hexdigest()

# Use send_file() which is way faster than read/write loop on bytes
import json
from tempfile import mkstemp
from flask import send_file
tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-")

output = json.dumps({'screenshot': "data:image/jpeg;base64,{}".format(
base64.b64encode(state[0]).decode('ascii')),
'xpath_data': state[1],
'session_age_start': browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start,
'browser_time_remaining': round(remaining)
})

with os.fdopen(tmp_fd, 'w') as f:
f.write(output)

response = make_response(send_file(path_or_file=tmp_file,
mimetype='application/json; charset=UTF-8',
etag=True))
# No longer needed
os.unlink(tmp_file)
# Create the response with ETag
response = Response(json_data, mimetype="application/json; charset=UTF-8")
response.set_etag(etag_hash)

return response

Expand Down
43 changes: 18 additions & 25 deletions changedetectionio/blueprint/browser_steps/browser_steps.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/usr/bin/env python3

import os
import time
import re
from random import randint
from loguru import logger

from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
from changedetectionio.content_fetchers.base import manage_user_agent
from changedetectionio.safe_jinja import render as jinja_render



# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
# 0- off, 1- on
browser_step_ui_config = {'Choose one': '0 0',
Expand Down Expand Up @@ -279,6 +280,7 @@ def connect(self, proxy=None):
logger.debug(f"Time to browser setup {time.time()-now:.2f}s")
self.page.wait_for_timeout(1 * 1000)


def mark_as_closed(self):
logger.debug("Page closed, cleaning up..")

Expand All @@ -296,39 +298,30 @@ def get_current_state(self):
now = time.time()
self.page.wait_for_timeout(1 * 1000)

# The actual screenshot
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)

full_height = self.page.evaluate("document.documentElement.scrollHeight")

if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
screenshot = capture_stitched_together_full_page(self.page)
else:
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)

logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")

now = time.time()
self.page.evaluate("var include_filters=''")
# Go find the interactive elements
# @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span'
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)

xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
# So the JS will find the smallest one first
xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
logger.debug(f"Time to complete get_current_state of browser {time.time()-now:.2f}s")
# except
logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s")

# playwright._impl._api_types.Error: Browser closed.
# @todo show some countdown timer?
return (screenshot, xpath_data)

def request_visualselector_data(self):
"""
Does the same that the playwright operation in content_fetcher does
This is used to just bump the VisualSelector data so it' ready to go if they click on the tab
@todo refactor and remove duplicate code, add include_filters
:param xpath_data:
:param screenshot:
:param current_include_filters:
:return:
"""
import importlib.resources
self.page.evaluate("var include_filters=''")
xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
from changedetectionio.content_fetchers import visualselector_xpath_selectors
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))

return (screenshot, xpath_data)
104 changes: 104 additions & 0 deletions changedetectionio/content_fetchers/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@

# Pages with a vertical height longer than this will use the 'stitch together' method.

# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.


# The size at which we will switch to stitching method
SCREENSHOT_SIZE_STITCH_THRESHOLD=8000

from loguru import logger

def capture_stitched_together_full_page(page):
import io
import os
import time
from PIL import Image, ImageDraw, ImageFont

MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4 # Maximum total height for the final image (When in stitch mode)
MAX_CHUNK_HEIGHT = 4000 # Height per screenshot chunk
WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay

# Save the original viewport size
original_viewport = page.viewport_size
now = time.time()

try:
viewport = page.viewport_size
page_height = page.evaluate("document.documentElement.scrollHeight")

# Limit the total capture height
capture_height = min(page_height, MAX_TOTAL_HEIGHT)

images = []
total_captured_height = 0

for offset in range(0, capture_height, MAX_CHUNK_HEIGHT):
# Ensure we do not exceed the total height limit
chunk_height = min(MAX_CHUNK_HEIGHT, MAX_TOTAL_HEIGHT - total_captured_height)

# Adjust viewport size for this chunk
page.set_viewport_size({"width": viewport["width"], "height": chunk_height})

# Scroll to the correct position
page.evaluate(f"window.scrollTo(0, {offset})")

# Capture screenshot chunk
screenshot_bytes = page.screenshot(type='jpeg', quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
images.append(Image.open(io.BytesIO(screenshot_bytes)))

total_captured_height += chunk_height

# Stop if we reached the maximum total height
if total_captured_height >= MAX_TOTAL_HEIGHT:
break

# Create the final stitched image
stitched_image = Image.new('RGB', (viewport["width"], total_captured_height))
y_offset = 0

# Stitch the screenshot chunks together
for img in images:
stitched_image.paste(img, (0, y_offset))
y_offset += img.height

logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s")

# Overlay warning text if the screenshot was trimmed
if page_height > MAX_TOTAL_HEIGHT:
draw = ImageDraw.Draw(stitched_image)
warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"

# Load font (default system font if Arial is unavailable)
try:
font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT) # Arial (Windows/Mac)
except IOError:
font = ImageFont.load_default() # Default font if Arial not found

# Get text bounding box (correct method for newer Pillow versions)
text_bbox = draw.textbbox((0, 0), warning_text, font=font)
text_width = text_bbox[2] - text_bbox[0] # Calculate text width
text_height = text_bbox[3] - text_bbox[1] # Calculate text height

# Define background rectangle (top of the image)
draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white")

# Center text horizontally within the warning area
text_x = (viewport["width"] - text_width) // 2
text_y = (WARNING_TEXT_HEIGHT - text_height) // 2

# Draw the warning text in red
draw.text((text_x, text_y), warning_text, fill="red", font=font)

# Save or return the final image
output = io.BytesIO()
stitched_image.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
screenshot = output.getvalue()

finally:
# Restore the original viewport size
page.set_viewport_size(original_viewport)

return screenshot
18 changes: 14 additions & 4 deletions changedetectionio/content_fetchers/playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from loguru import logger

from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable

Expand Down Expand Up @@ -89,6 +90,7 @@ def run(self,
from playwright.sync_api import sync_playwright
import playwright._impl._errors
from changedetectionio.content_fetchers import visualselector_xpath_selectors
import time
self.delete_browser_steps_screenshots()
response = None

Expand Down Expand Up @@ -179,6 +181,7 @@ def run(self,

self.page.wait_for_timeout(extra_wait * 1000)

now = time.time()
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
if current_include_filters is not None:
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
Expand All @@ -190,6 +193,8 @@ def run(self,
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")

self.content = self.page.content()
logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s")

# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large
Expand All @@ -199,10 +204,15 @@ def run(self,
# acceptable screenshot quality here
try:
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
self.screenshot = self.page.screenshot(type='jpeg',
full_page=True,
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)),
)
full_height = self.page.evaluate("document.documentElement.scrollHeight")

if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(
f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
self.screenshot = capture_stitched_together_full_page(self.page)
else:
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))

except Exception as e:
# It's likely the screenshot was too long/big and something crashed
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
Expand Down
Loading