diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py index 353f98e46a3..21abd2827ba 100644 --- a/changedetectionio/blueprint/browser_steps/browser_steps.py +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -4,7 +4,7 @@ from random import randint from loguru import logger -from changedetectionio.content_fetchers.helpers import capture_full_page +from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT from changedetectionio.content_fetchers.base import manage_user_agent from changedetectionio.safe_jinja import render as jinja_render @@ -293,12 +293,16 @@ def has_expired(self): def get_current_state(self): """Return the screenshot and interactive elements mapping, generally always called after action_()""" import importlib.resources + import json + # because we for now only run browser steps in playwright mode (not puppeteer mode) + from changedetectionio.content_fetchers.playwright import capture_full_page + xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text() now = time.time() self.page.wait_for_timeout(1 * 1000) - screenshot = capture_full_page(self.page) + screenshot = capture_full_page(page=self.page) logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s") @@ -306,13 +310,21 @@ def get_current_state(self): self.page.evaluate("var include_filters=''") # Go find the interactive elements # @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers? - elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span' - xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements) - xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") + self.page.request_gc() + + scan_elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span' + + MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) + xpath_data = json.loads(self.page.evaluate(xpath_element_js, { + "visualselector_xpath_selectors": scan_elements, + "max_height": MAX_TOTAL_HEIGHT + })) + self.page.request_gc() + # So the JS will find the smallest one first xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True) - logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s") + logger.debug(f"Time to scrape xPath element data in browser {time.time()-now:.2f}s") # playwright._impl._api_types.Error: Browser closed. # @todo show some countdown timer? diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index 94489f6d5ba..4d9145fae07 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -7,11 +7,29 @@ # Visual Selector scraper - 'Button' is there because some sites have . visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button' +SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000 +SCREENSHOT_DEFAULT_QUALITY = 40 + +# Maximum total height for the final image (When in stitch mode). +# We limit this to 16000px due to the huge amount of RAM that was being used +# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc) +MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) + +# The size at which we will switch to stitching method, when below this (and +# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default +# screenshot method. +SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000 # available_fetchers() will scan this implementation looking for anything starting with html_ # this information is used in the form selections from changedetectionio.content_fetchers.requests import fetcher as html_requests + +import importlib.resources +XPATH_ELEMENT_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8') +INSTOCK_DATA_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8') + + def available_fetchers(): # See the if statement at the bottom of this file for how we switch between playwright and webdriver import inspect diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index a482fbdc92c..bfa7e83cdfb 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -63,11 +63,6 @@ class Fetcher(): # Time ONTOP of the system defined env minimum time render_extract_delay = 0 - def __init__(self): - import importlib.resources - self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8') - self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8') - @abstractmethod def get_error(self): return self.error @@ -143,6 +138,7 @@ def iterate_browser_steps(self, start_url=None): logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...") self.screenshot_step("before-" + str(step_n)) self.save_step_html("before-" + str(step_n)) + try: optional_value = step['optional_value'] selector = step['selector'] diff --git a/changedetectionio/content_fetchers/helpers.py b/changedetectionio/content_fetchers/helpers.py deleted file mode 100644 index def26ca3b59..00000000000 --- a/changedetectionio/content_fetchers/helpers.py +++ /dev/null @@ -1,138 +0,0 @@ -# Pages with a vertical height longer than this will use the 'stitch together' method. - -# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices). -# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits. -# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer. - -from loguru import logger - -def capture_full_page(page): - import io - import os - import time - from PIL import Image, ImageDraw, ImageFont - - # Maximum total height for the final image (When in stitch mode). - # We limit this to 16000px due to the huge amount of RAM that was being used - # Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc) - MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", 16000)) - - # The size at which we will switch to stitching method, when below this (and - # MAX_TOTAL_HEIGHT which can be set by a user) we will use the default - # screenshot method. - SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000 - - WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay - - # Save the original viewport size - original_viewport = page.viewport_size - start = time.time() - - stitched_image = None - - try: - viewport_width = original_viewport["width"] - viewport_height = original_viewport["height"] - - page_height = page.evaluate("document.documentElement.scrollHeight") - - # Optimization to avoid unnecessary stitching if we can avoid it - # Use the default screenshot method for smaller pages to take advantage - # of GPU and native playwright screenshot optimizations - if ( - page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD - and page_height < MAX_TOTAL_HEIGHT - ): - logger.debug("Using default screenshot method") - screenshot = page.screenshot( - type="jpeg", - quality=int(os.getenv("SCREENSHOT_QUALITY", 30)), - full_page=True, - ) - logger.debug(f"Screenshot captured in {time.time() - start:.2f}s") - return screenshot - - logger.debug( - "Using stitching method for large screenshot because page height exceeds threshold" - ) - - # Limit the total capture height - capture_height = min(page_height, MAX_TOTAL_HEIGHT) - - # Calculate number of chunks needed using ORIGINAL viewport height - num_chunks = (capture_height + viewport_height - 1) // viewport_height - - # Create the final image upfront to avoid holding all chunks in memory - stitched_image = Image.new("RGB", (viewport_width, capture_height)) - - # Track cumulative paste position - y_offset = 0 - - for _ in range(num_chunks): - # Scroll to position (no viewport resizing) - page.evaluate(f"window.scrollTo(0, {y_offset})") - - # Capture only the visible area using clip - with io.BytesIO( - page.screenshot( - type="jpeg", - clip={ - "x": 0, - "y": 0, - "width": viewport_width, - "height": min(viewport_height, capture_height - y_offset), - }, - quality=int(os.getenv("SCREENSHOT_QUALITY", 30)), - ) - ) as buf: - with Image.open(buf) as img: - img.load() - stitched_image.paste(img, (0, y_offset)) - y_offset += img.height - - logger.debug(f"Screenshot stitched together in {time.time() - start:.2f}s") - - # Overlay warning text if the screenshot was trimmed - if capture_height < page_height: - draw = ImageDraw.Draw(stitched_image) - warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long" - - # Load font (default system font if Arial is unavailable) - try: - font = ImageFont.truetype( - "arial.ttf", WARNING_TEXT_HEIGHT - ) # Arial (Windows/Mac) - except IOError: - font = ImageFont.load_default() # Default font if Arial not found - - # Get text bounding box (correct method for newer Pillow versions) - text_bbox = draw.textbbox((0, 0), warning_text, font=font) - text_width = text_bbox[2] - text_bbox[0] # Calculate text width - text_height = text_bbox[3] - text_bbox[1] # Calculate text height - - # Define background rectangle (top of the image) - draw.rectangle( - [(0, 0), (viewport_width, WARNING_TEXT_HEIGHT)], fill="white" - ) - - # Center text horizontally within the warning area - text_x = (viewport_width - text_width) // 2 - text_y = (WARNING_TEXT_HEIGHT - text_height) // 2 - - # Draw the warning text in red - draw.text((text_x, text_y), warning_text, fill="red", font=font) - - # Save final image - with io.BytesIO() as output: - stitched_image.save( - output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30)) - ) - screenshot = output.getvalue() - - finally: - # Restore the original viewport size - page.set_viewport_size(original_viewport) - if stitched_image is not None: - stitched_image.close() - - return screenshot diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index 207b8bb8dfb..411f8c9592b 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -4,10 +4,102 @@ from loguru import logger -from changedetectionio.content_fetchers.helpers import capture_full_page +from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \ + SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS +from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable + + +def capture_full_page(page): + import os + import time + from multiprocessing import Process, Pipe + + start = time.time() + + page_height = page.evaluate("document.documentElement.scrollHeight") + + logger.debug(f"Playwright viewport size {page.viewport_size}") + + ############################################################ + #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) ##### + ############################################################ + + # Optimization to avoid unnecessary stitching if we can avoid it + # Use the default screenshot method for smaller pages to take advantage + # of GPU and native playwright screenshot optimizations + # - No PIL needed here, no danger of memory leaks, no sub process required + if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ): + logger.debug("Using default screenshot method") + page.request_gc() + screenshot = page.screenshot( + type="jpeg", + quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), + full_page=True, + ) + page.request_gc() + logger.debug(f"Screenshot captured in {time.time() - start:.2f}s") + return screenshot + + + + ################################################################################### + #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES ##### + ################################################################################### + # - PIL can easily allocate memory and not release it cleanly + # - Fetching screenshot from playwright seems OK + # Image.new is leaky even with .close() + # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling + + logger.debug( + "Using stitching method for large screenshot because page height exceeds threshold" + ) + + # Limit the total capture height + capture_height = min(page_height, MAX_TOTAL_HEIGHT) + + # Calculate number of chunks needed using ORIGINAL viewport height + num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height'] + screenshot_chunks = [] + + # Track cumulative paste position + y_offset = 0 + for _ in range(num_chunks): + + page.request_gc() + page.evaluate(f"window.scrollTo(0, {y_offset})") + page.request_gc() + h = min(page.viewport_size['height'], capture_height - y_offset) + screenshot_chunks.append(page.screenshot( + type="jpeg", + clip={ + "x": 0, + "y": 0, + "width": page.viewport_size['width'], + "height": h, + }, + quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), + )) + + y_offset += h # maybe better to inspect the image here? + page.request_gc() + + # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling + + parent_conn, child_conn = Pipe() + p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height)) + p.start() + result = parent_conn.recv_bytes() + p.join() + + screenshot_chunks = None + logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s") + + return result + + class fetcher(Fetcher): fetcher_description = "Playwright {}/Javascript".format( os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() @@ -60,7 +152,8 @@ def __init__(self, proxy_override=None, custom_browser_connection_url=None): def screenshot_step(self, step_n=''): super().screenshot_step(step_n=step_n) - screenshot = capture_full_page(self.page) + screenshot = capture_full_page(page=self.page) + if self.browser_steps_screenshot_path is not None: destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) @@ -89,7 +182,6 @@ def run(self, from playwright.sync_api import sync_playwright import playwright._impl._errors - from changedetectionio.content_fetchers import visualselector_xpath_selectors import time self.delete_browser_steps_screenshots() response = None @@ -185,13 +277,22 @@ def run(self, self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) else: self.page.evaluate("var include_filters=''") + self.page.request_gc() + + # request_gc before and after evaluate to free up memory + # @todo browsersteps etc + MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) + self.xpath_data = self.page.evaluate(XPATH_ELEMENT_JS, { + "visualselector_xpath_selectors": visualselector_xpath_selectors, + "max_height": MAX_TOTAL_HEIGHT + }) + self.page.request_gc() - self.xpath_data = self.page.evaluate( - "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") - self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") + self.instock_data = self.page.evaluate(INSTOCK_DATA_JS) + self.page.request_gc() self.content = self.page.content() - logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s") + logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s") # Bug 3 in Playwright screenshot handling # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it @@ -202,11 +303,18 @@ def run(self, # acceptable screenshot quality here try: # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage - self.screenshot = capture_full_page(self.page) + self.screenshot = capture_full_page(page=self.page) except Exception as e: # It's likely the screenshot was too long/big and something crashed raise ScreenshotUnavailable(url=url, status_code=self.status_code) finally: + # Request garbage collection one more time before closing + try: + self.page.request_gc() + except: + pass + + # Clean up resources properly context.close() browser.close() diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index 9dd06c38e4a..bdb0f4a3df9 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -6,8 +6,93 @@ from loguru import logger +from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \ + SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError +from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker + + +# Bug 3 in Playwright screenshot handling +# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it + +# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded +# which will significantly increase the IO size between the server and client, it's recommended to use the lowest +# acceptable screenshot quality here +async def capture_full_page(page): + import os + import time + from multiprocessing import Process, Pipe + + start = time.time() + + page_height = await page.evaluate("document.documentElement.scrollHeight") + + logger.debug(f"Puppeteer viewport size {page.viewport}") + + ############################################################ + #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) ##### + ############################################################ + + # Optimization to avoid unnecessary stitching if we can avoid it + # Use the default screenshot method for smaller pages to take advantage + # of GPU and native playwright screenshot optimizations + # - No PIL needed here, no danger of memory leaks, no sub process required + if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ): + logger.debug("Using default screenshot method") + await page.evaluate(f"window.scrollTo(0, 0)") + screenshot = await page.screenshot( + type_="jpeg", + quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), + fullPage=True, + ) + logger.debug(f"Screenshot captured in {time.time() - start:.2f}s") + return screenshot + + ################################################################################### + #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES ##### + ################################################################################### + # - PIL can easily allocate memory and not release it cleanly + # - Fetching screenshot from playwright seems OK + # Image.new is leaky even with .close() + # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling + + logger.debug( + "Using stitching method for large screenshot because page height exceeds threshold" + ) + + # Limit the total capture height + capture_height = min(page_height, MAX_TOTAL_HEIGHT) + + # Calculate number of chunks needed using ORIGINAL viewport height + num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height'] + screenshot_chunks = [] + + # Track cumulative paste position + y_offset = 0 + for _ in range(num_chunks): + await page.evaluate(f"window.scrollTo(0, {y_offset})") + h = min(page.viewport['height'], capture_height - y_offset) + screenshot_chunks.append(await page.screenshot( + type_="jpeg", + quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)), + )) + + y_offset += h # maybe better to inspect the image here? + + # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling + + parent_conn, child_conn = Pipe() + p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height)) + p.start() + result = parent_conn.recv_bytes() + p.join() + + screenshot_chunks = None + logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s") + + return result + class fetcher(Fetcher): fetcher_description = "Puppeteer/direct {}/Javascript".format( @@ -79,7 +164,6 @@ async def fetch_page(self, empty_pages_are_a_change ): - from changedetectionio.content_fetchers import visualselector_xpath_selectors self.delete_browser_steps_screenshots() extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay @@ -181,11 +265,10 @@ async def fetch_page(self, raise PageUnloadable(url=url, status_code=None, message=str(e)) if self.status_code != 200 and not ignore_status_codes: - screenshot = await self.page.screenshot(type_='jpeg', - fullPage=True, - quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + screenshot = await capture_full_page(page=self.page) raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) + content = await self.page.content if not empty_pages_are_a_change and len(content.strip()) == 0: @@ -203,46 +286,31 @@ async def fetch_page(self, # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) # Setup the xPath/VisualSelector scraper - if current_include_filters is not None: + if current_include_filters: js = json.dumps(current_include_filters) await self.page.evaluate(f"var include_filters={js}") else: await self.page.evaluate(f"var include_filters=''") - self.xpath_data = await self.page.evaluate( - "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") - self.instock_data = await self.page.evaluate("async () => {" + self.instock_data_js + "}") + MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) + self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, { + "visualselector_xpath_selectors": visualselector_xpath_selectors, + "max_height": MAX_TOTAL_HEIGHT + }) + if not self.xpath_data: + raise Exception(f"Content Fetcher > xPath scraper failed. Please report this URL so we can fix it :)") + + self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS) self.content = await self.page.content - # Bug 3 in Playwright screenshot handling - # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it - # JPEG is better here because the screenshots can be very very large - # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded - # which will significantly increase the IO size between the server and client, it's recommended to use the lowest - # acceptable screenshot quality here - try: - self.screenshot = await self.page.screenshot(type_='jpeg', - fullPage=True, - quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) - except Exception as e: - logger.error("Error fetching screenshot") - # // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw' - # // @ todo after text extract, we can place some overlay text with red background to say 'croppped' - logger.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot') - try: - self.screenshot = await self.page.screenshot(type_='jpeg', - fullPage=False, - quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) - except Exception as e: - logger.error('ERROR: Failed to get viewport-only reduced screenshot :(') - pass - finally: - # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need - logger.success(f"Fetching '{url}' complete, closing page") - await self.page.close() - logger.success(f"Fetching '{url}' complete, closing browser") - await browser.close() + self.screenshot = await capture_full_page(page=self.page) + + # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need + logger.success(f"Fetching '{url}' complete, closing page") + await self.page.close() + logger.success(f"Fetching '{url}' complete, closing browser") + await browser.close() logger.success(f"Fetching '{url}' complete, exiting puppeteer fetch.") async def main(self, **kwargs): diff --git a/changedetectionio/content_fetchers/res/puppeteer_fetch.js b/changedetectionio/content_fetchers/res/puppeteer_fetch.js deleted file mode 100644 index 21c5abc87ea..00000000000 --- a/changedetectionio/content_fetchers/res/puppeteer_fetch.js +++ /dev/null @@ -1,190 +0,0 @@ -module.exports = async ({page, context}) => { - - var { - url, - execute_js, - user_agent, - extra_wait_ms, - req_headers, - include_filters, - xpath_element_js, - screenshot_quality, - proxy_username, - proxy_password, - disk_cache_dir, - no_cache_list, - block_url_list, - } = context; - - await page.setBypassCSP(true) - await page.setExtraHTTPHeaders(req_headers); - - if (user_agent) { - await page.setUserAgent(user_agent); - } - // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded - - await page.setDefaultNavigationTimeout(0); - - if (proxy_username) { - // Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer - // https://github.com/puppeteer/puppeteer/issues/676 ? - // https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2 - // https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/ - await page.authenticate({ - username: proxy_username, - password: proxy_password - }); - } - - await page.setViewport({ - width: 1024, - height: 768, - deviceScaleFactor: 1, - }); - - await page.setRequestInterception(true); - if (disk_cache_dir) { - console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<"); - } - const fs = require('fs'); - const crypto = require('crypto'); - - function file_is_expired(file_path) { - if (!fs.existsSync(file_path)) { - return true; - } - var stats = fs.statSync(file_path); - const now_date = new Date(); - const expire_seconds = 300; - if ((now_date / 1000) - (stats.mtime.getTime() / 1000) > expire_seconds) { - console.log("CACHE EXPIRED: " + file_path); - return true; - } - return false; - - } - - page.on('request', async (request) => { - // General blocking of requests that waste traffic - if (block_url_list.some(substring => request.url().toLowerCase().includes(substring))) return request.abort(); - - if (disk_cache_dir) { - const url = request.url(); - const key = crypto.createHash('md5').update(url).digest("hex"); - const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/'; - - // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js - - if (fs.existsSync(dir_path + key)) { - console.log("* CACHE HIT , using - " + dir_path + key + " - " + url); - const cached_data = fs.readFileSync(dir_path + key); - // @todo headers can come from dir_path+key+".meta" json file - request.respond({ - status: 200, - //contentType: 'text/html', //@todo - body: cached_data - }); - return; - } - } - request.continue(); - }); - - - if (disk_cache_dir) { - page.on('response', async (response) => { - const url = response.url(); - // Basic filtering for sane responses - if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) { - console.log("Skipping (not useful) - Status:" + response.status() + " Method:" + response.request().method() + " ResourceType:" + response.request().resourceType() + " " + url); - return; - } - if (no_cache_list.some(substring => url.toLowerCase().includes(substring))) { - console.log("Skipping (no_cache_list) - " + url); - return; - } - if (url.toLowerCase().includes('data:')) { - console.log("Skipping (embedded-data) - " + url); - return; - } - response.buffer().then(buffer => { - if (buffer.length > 100) { - console.log("Cache - Saving " + response.request().method() + " - " + url + " - " + response.request().resourceType()); - - const key = crypto.createHash('md5').update(url).digest("hex"); - const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/'; - - if (!fs.existsSync(dir_path)) { - fs.mkdirSync(dir_path, {recursive: true}) - } - - if (fs.existsSync(dir_path + key)) { - if (file_is_expired(dir_path + key)) { - fs.writeFileSync(dir_path + key, buffer); - } - } else { - fs.writeFileSync(dir_path + key, buffer); - } - } - }); - }); - } - - const r = await page.goto(url, { - waitUntil: 'load' - }); - - await page.waitForTimeout(1000); - await page.waitForTimeout(extra_wait_ms); - - if (execute_js) { - await page.evaluate(execute_js); - await page.waitForTimeout(200); - } - - var xpath_data; - var instock_data; - try { - // Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode - // (Once the old playwright is removed) - xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters); - instock_data = await page.evaluate(() => {%instock_scrape_code%}); - } catch (e) { - console.log(e); - } - - // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure - // Wrap it here (for now) - - var b64s = false; - try { - b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'}); - } catch (e) { - console.log(e); - } - - // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw' - if (!b64s) { - // @todo after text extract, we can place some overlay text with red background to say 'croppped' - console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot'); - try { - b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'}); - } catch (e) { - console.log(e); - } - } - - var html = await page.content(); - return { - data: { - 'content': html, - 'headers': r.headers(), - 'instock_data': instock_data, - 'screenshot': b64s, - 'status_code': r.status(), - 'xpath_data': xpath_data - }, - type: 'application/json', - }; -}; \ No newline at end of file diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 1ebf6da6a4d..098b208ac32 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -1,229 +1,220 @@ -// Restock Detector -// (c) Leigh Morresi dgtlmoon@gmail.com -// -// Assumes the product is in stock to begin with, unless the following appears above the fold ; -// - outOfStockTexts appears above the fold (out of stock) -// - negateOutOfStockRegex (really is in stock) - -function isItemInStock() { - // @todo Pass these in so the same list can be used in non-JS fetchers - const outOfStockTexts = [ - ' أخبرني عندما يتوفر', - '0 in stock', - 'actuellement indisponible', - 'agotado', - 'article épuisé', - 'artikel zurzeit vergriffen', - 'as soon as stock is available', - 'ausverkauft', // sold out - 'available for back order', - 'awaiting stock', - 'back in stock soon', - 'back-order or out of stock', - 'backordered', - 'benachrichtigt mich', // notify me - 'brak na stanie', - 'brak w magazynie', - 'coming soon', - 'currently have any tickets for this', - 'currently unavailable', - 'dieser artikel ist bald wieder verfügbar', - 'dostępne wkrótce', - 'en rupture', - 'en rupture de stock', - 'épuisé', - 'esgotado', - 'indisponible', - 'indisponível', - 'isn\'t in stock right now', - 'isnt in stock right now', - 'isn’t in stock right now', - 'item is no longer available', - 'let me know when it\'s available', - 'mail me when available', - 'message if back in stock', - 'mevcut değil', - 'nachricht bei', - 'nicht auf lager', - 'nicht lagernd', - 'nicht lieferbar', - 'nicht verfügbar', - 'nicht vorrätig', - 'nicht zur verfügung', - 'nie znaleziono produktów', - 'niet beschikbaar', - 'niet leverbaar', - 'niet op voorraad', - 'no disponible', - 'non disponibile', - 'non disponible', - 'no longer in stock', - 'no tickets available', - 'not available', - 'not currently available', - 'not in stock', - 'notify me when available', - 'notify me', - 'notify when available', - 'não disponível', - 'não estamos a aceitar encomendas', - 'out of stock', - 'out-of-stock', - 'plus disponible', - 'prodotto esaurito', - 'produkt niedostępny', - 'rupture', - 'sold out', - 'sold-out', - 'stok habis', - 'stok kosong', - 'stok varian ini habis', - 'stokta yok', - 'temporarily out of stock', - 'temporarily unavailable', - 'there were no search results for', - 'this item is currently unavailable', - 'tickets unavailable', - 'tidak dijual', - 'tidak tersedia', - 'tijdelijk uitverkocht', - 'tiket tidak tersedia', - 'tükendi', - 'unavailable nearby', - 'unavailable tickets', - 'vergriffen', - 'vorbestellen', - 'vorbestellung ist bald möglich', - 'we don\'t currently have any', - 'we couldn\'t find any products that match', - 'we do not currently have an estimate of when this product will be back in stock.', - 'we don\'t know when or if this item will be back in stock.', - 'we were not able to find a match', - 'when this arrives in stock', - 'zur zeit nicht an lager', - '品切れ', - '已售', - '已售完', - '품절' - ]; - - - const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); - - function getElementBaseText(element) { - // .textContent can include text from children which may give the wrong results - // scan only immediate TEXT_NODEs, which will be a child of the element - var text = ""; - for (var i = 0; i < element.childNodes.length; ++i) - if (element.childNodes[i].nodeType === Node.TEXT_NODE) - text += element.childNodes[i].textContent; - return text.toLowerCase().trim(); - } +async () => { + + function isItemInStock() { + // @todo Pass these in so the same list can be used in non-JS fetchers + const outOfStockTexts = [ + ' أخبرني عندما يتوفر', + '0 in stock', + 'actuellement indisponible', + 'agotado', + 'article épuisé', + 'artikel zurzeit vergriffen', + 'as soon as stock is available', + 'ausverkauft', // sold out + 'available for back order', + 'awaiting stock', + 'back in stock soon', + 'back-order or out of stock', + 'backordered', + 'benachrichtigt mich', // notify me + 'brak na stanie', + 'brak w magazynie', + 'coming soon', + 'currently have any tickets for this', + 'currently unavailable', + 'dieser artikel ist bald wieder verfügbar', + 'dostępne wkrótce', + 'en rupture', + 'en rupture de stock', + 'épuisé', + 'esgotado', + 'indisponible', + 'indisponível', + 'isn\'t in stock right now', + 'isnt in stock right now', + 'isn’t in stock right now', + 'item is no longer available', + 'let me know when it\'s available', + 'mail me when available', + 'message if back in stock', + 'mevcut değil', + 'nachricht bei', + 'nicht auf lager', + 'nicht lagernd', + 'nicht lieferbar', + 'nicht verfügbar', + 'nicht vorrätig', + 'nicht zur verfügung', + 'nie znaleziono produktów', + 'niet beschikbaar', + 'niet leverbaar', + 'niet op voorraad', + 'no disponible', + 'non disponibile', + 'non disponible', + 'no longer in stock', + 'no tickets available', + 'not available', + 'not currently available', + 'not in stock', + 'notify me when available', + 'notify me', + 'notify when available', + 'não disponível', + 'não estamos a aceitar encomendas', + 'out of stock', + 'out-of-stock', + 'plus disponible', + 'prodotto esaurito', + 'produkt niedostępny', + 'rupture', + 'sold out', + 'sold-out', + 'stok habis', + 'stok kosong', + 'stok varian ini habis', + 'stokta yok', + 'temporarily out of stock', + 'temporarily unavailable', + 'there were no search results for', + 'this item is currently unavailable', + 'tickets unavailable', + 'tidak dijual', + 'tidak tersedia', + 'tijdelijk uitverkocht', + 'tiket tidak tersedia', + 'tükendi', + 'unavailable nearby', + 'unavailable tickets', + 'vergriffen', + 'vorbestellen', + 'vorbestellung ist bald möglich', + 'we don\'t currently have any', + 'we couldn\'t find any products that match', + 'we do not currently have an estimate of when this product will be back in stock.', + 'we don\'t know when or if this item will be back in stock.', + 'we were not able to find a match', + 'when this arrives in stock', + 'zur zeit nicht an lager', + '品切れ', + '已售', + '已售完', + '품절' + ]; + + + const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); + + function getElementBaseText(element) { + // .textContent can include text from children which may give the wrong results + // scan only immediate TEXT_NODEs, which will be a child of the element + var text = ""; + for (var i = 0; i < element.childNodes.length; ++i) + if (element.childNodes[i].nodeType === Node.TEXT_NODE) + text += element.childNodes[i].textContent; + return text.toLowerCase().trim(); + } - const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig'); + const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig'); - // The out-of-stock or in-stock-text is generally always above-the-fold - // and often below-the-fold is a list of related products that may or may not contain trigger text - // so it's good to filter to just the 'above the fold' elements - // and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist + // The out-of-stock or in-stock-text is generally always above-the-fold + // and often below-the-fold is a list of related products that may or may not contain trigger text + // so it's good to filter to just the 'above the fold' elements + // and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist // @todo - if it's SVG or IMG, go into image diff mode -// %ELEMENTS% replaced at injection time because different interfaces use it with different settings - - console.log("Scanning %ELEMENTS%"); - - function collectVisibleElements(parent, visibleElements) { - if (!parent) return; // Base case: if parent is null or undefined, return - - // Add the parent itself to the visible elements array if it's of the specified types - visibleElements.push(parent); - - // Iterate over the parent's children - const children = parent.children; - for (let i = 0; i < children.length; i++) { - const child = children[i]; - if ( - child.nodeType === Node.ELEMENT_NODE && - window.getComputedStyle(child).display !== 'none' && - window.getComputedStyle(child).visibility !== 'hidden' && - child.offsetWidth >= 0 && - child.offsetHeight >= 0 && - window.getComputedStyle(child).contentVisibility !== 'hidden' - ) { - // If the child is an element and is visible, recursively collect visible elements - collectVisibleElements(child, visibleElements); + + function collectVisibleElements(parent, visibleElements) { + if (!parent) return; // Base case: if parent is null or undefined, return + + // Add the parent itself to the visible elements array if it's of the specified types + visibleElements.push(parent); + + // Iterate over the parent's children + const children = parent.children; + for (let i = 0; i < children.length; i++) { + const child = children[i]; + if ( + child.nodeType === Node.ELEMENT_NODE && + window.getComputedStyle(child).display !== 'none' && + window.getComputedStyle(child).visibility !== 'hidden' && + child.offsetWidth >= 0 && + child.offsetHeight >= 0 && + window.getComputedStyle(child).contentVisibility !== 'hidden' + ) { + // If the child is an element and is visible, recursively collect visible elements + collectVisibleElements(child, visibleElements); + } } } - } - const elementsToScan = []; - collectVisibleElements(document.body, elementsToScan); + const elementsToScan = []; + collectVisibleElements(document.body, elementsToScan); + + var elementText = ""; + + // REGEXS THAT REALLY MEAN IT'S IN STOCK + for (let i = elementsToScan.length - 1; i >= 0; i--) { + const element = elementsToScan[i]; - var elementText = ""; + // outside the 'fold' or some weird text in the heading area + // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden + if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) { + continue + } - // REGEXS THAT REALLY MEAN IT'S IN STOCK - for (let i = elementsToScan.length - 1; i >= 0; i--) { - const element = elementsToScan[i]; + elementText = ""; + try { + if (element.tagName.toLowerCase() === "input") { + elementText = element.value.toLowerCase().trim(); + } else { + elementText = getElementBaseText(element); + } + } catch (e) { + console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e); + } - // outside the 'fold' or some weird text in the heading area - // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) { - continue + if (elementText.length) { + // try which ones could mean its in stock + if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) { + console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`) + return 'Possibly in stock'; + } + } } - elementText = ""; - try { + // OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK + for (let i = elementsToScan.length - 1; i >= 0; i--) { + const element = elementsToScan[i]; + // outside the 'fold' or some weird text in the heading area + // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden + // Note: theres also an automated test that places the 'out of stock' text fairly low down + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) { + continue + } + elementText = ""; if (element.tagName.toLowerCase() === "input") { elementText = element.value.toLowerCase().trim(); } else { elementText = getElementBaseText(element); } - } catch (e) { - console.warn('stock-not-in-stock.js scraper - handling element for gettext failed', e); - } - if (elementText.length) { - // try which ones could mean its in stock - if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) { - console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`) - return 'Possibly in stock'; - } - } - } - - // OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK - for (let i = elementsToScan.length - 1; i >= 0; i--) { - const element = elementsToScan[i]; - // outside the 'fold' or some weird text in the heading area - // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - // Note: theres also an automated test that places the 'out of stock' text fairly low down - if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) { - continue - } - elementText = ""; - if (element.tagName.toLowerCase() === "input") { - elementText = element.value.toLowerCase().trim(); - } else { - elementText = getElementBaseText(element); - } - - if (elementText.length) { - // and these mean its out of stock - for (const outOfStockText of outOfStockTexts) { - if (elementText.includes(outOfStockText)) { - console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) - return outOfStockText; // item is out of stock + if (elementText.length) { + // and these mean its out of stock + for (const outOfStockText of outOfStockTexts) { + if (elementText.includes(outOfStockText)) { + console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) + return outOfStockText; // item is out of stock + } } } } - } - console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`) - return 'Possibly in stock'; // possibly in stock, cant decide otherwise. -} + console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`) + return 'Possibly in stock'; // possibly in stock, cant decide otherwise. + } // returns the element text that makes it think it's out of stock -return isItemInStock().trim() - - + return isItemInStock().trim() +} diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index 182a9b1df92..3d6bc7c7cfb 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -1,285 +1,285 @@ -// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com) -// All rights reserved. - -// @file Scrape the page looking for elements of concern (%ELEMENTS%) -// http://matatk.agrip.org.uk/tests/position-and-width/ -// https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate -// -// Some pages like https://www.londonstockexchange.com/stock/NCCL/ncondezi-energy-limited/analysis -// will automatically force a scroll somewhere, so include the position offset -// Lets hope the position doesnt change while we iterate the bbox's, but this is better than nothing -var scroll_y = 0; -try { - scroll_y = +document.documentElement.scrollTop || document.body.scrollTop -} catch (e) { - console.log(e); -} +async (options) => { + let visualselector_xpath_selectors = options.visualselector_xpath_selectors + let max_height = options.max_height + + var scroll_y = 0; + try { + scroll_y = +document.documentElement.scrollTop || document.body.scrollTop + } catch (e) { + console.log(e); + } // Include the getXpath script directly, easier than fetching -function getxpath(e) { - var n = e; - if (n && n.id) return '//*[@id="' + n.id + '"]'; - for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) { - for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling; - for (d = n.nextSibling; d;) { - if (d.nodeName === n.nodeName) { - r = !0; - break + function getxpath(e) { + var n = e; + if (n && n.id) return '//*[@id="' + n.id + '"]'; + for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) { + for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling; + for (d = n.nextSibling; d;) { + if (d.nodeName === n.nodeName) { + r = !0; + break + } + d = d.nextSibling } - d = d.nextSibling + o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode } - o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode + return o.length ? "/" + o.reverse().join("/") : "" } - return o.length ? "/" + o.reverse().join("/") : "" -} -const findUpTag = (el) => { - let r = el - chained_css = []; - depth = 0; - - // Strategy 1: If it's an input, with name, and there's only one, prefer that - if (el.name !== undefined && el.name.length) { - var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]"; - var proposed_element = window.document.querySelectorAll(proposed); - if (proposed_element.length) { - if (proposed_element.length === 1) { - return proposed; - } else { - // Some sites change ID but name= stays the same, we can hit it if we know the index - // Find all the elements that match and work out the input[n] - var n = Array.from(proposed_element).indexOf(el); - // Return a Playwright selector for nthinput[name=zipcode] - return proposed + " >> nth=" + n; + const findUpTag = (el) => { + let r = el + chained_css = []; + depth = 0; + + // Strategy 1: If it's an input, with name, and there's only one, prefer that + if (el.name !== undefined && el.name.length) { + var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]"; + var proposed_element = window.document.querySelectorAll(proposed); + if (proposed_element.length) { + if (proposed_element.length === 1) { + return proposed; + } else { + // Some sites change ID but name= stays the same, we can hit it if we know the index + // Find all the elements that match and work out the input[n] + var n = Array.from(proposed_element).indexOf(el); + // Return a Playwright selector for nthinput[name=zipcode] + return proposed + " >> nth=" + n; + } } } - } - // Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4 - while (r.parentNode) { - if (depth === 5) { - break; - } - if ('' !== r.id) { - chained_css.unshift("#" + CSS.escape(r.id)); - final_selector = chained_css.join(' > '); - // Be sure theres only one, some sites have multiples of the same ID tag :-( - if (window.document.querySelectorAll(final_selector).length === 1) { - return final_selector; + // Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4 + while (r.parentNode) { + if (depth === 5) { + break; } - return null; - } else { - chained_css.unshift(r.tagName.toLowerCase()); + if ('' !== r.id) { + chained_css.unshift("#" + CSS.escape(r.id)); + final_selector = chained_css.join(' > '); + // Be sure theres only one, some sites have multiples of the same ID tag :-( + if (window.document.querySelectorAll(final_selector).length === 1) { + return final_selector; + } + return null; + } else { + chained_css.unshift(r.tagName.toLowerCase()); + } + r = r.parentNode; + depth += 1; } - r = r.parentNode; - depth += 1; + return null; } - return null; -} // @todo - if it's SVG or IMG, go into image diff mode -// %ELEMENTS% replaced at injection time because different interfaces use it with different settings -var size_pos = []; + var size_pos = []; // after page fetch, inject this JS // build a map of all elements and their positions (maybe that only include text?) -var bbox; -console.log("Scanning %ELEMENTS%"); + var bbox; + console.log(`Scanning for "${visualselector_xpath_selectors}"`); -function collectVisibleElements(parent, visibleElements) { - if (!parent) return; // Base case: if parent is null or undefined, return + function collectVisibleElements(parent, visibleElements) { + if (!parent) return; // Base case: if parent is null or undefined, return - // Add the parent itself to the visible elements array if it's of the specified types - const tagName = parent.tagName.toLowerCase(); - if ("%ELEMENTS%".split(',').includes(tagName)) { - visibleElements.push(parent); - } + // Add the parent itself to the visible elements array if it's of the specified types + const tagName = parent.tagName.toLowerCase(); + if (visualselector_xpath_selectors.split(',').includes(tagName)) { + visibleElements.push(parent); + } - // Iterate over the parent's children - const children = parent.children; - for (let i = 0; i < children.length; i++) { - const child = children[i]; - const computedStyle = window.getComputedStyle(child); - - if ( - child.nodeType === Node.ELEMENT_NODE && - computedStyle.display !== 'none' && - computedStyle.visibility !== 'hidden' && - child.offsetWidth >= 0 && - child.offsetHeight >= 0 && - computedStyle.contentVisibility !== 'hidden' - ) { - // If the child is an element and is visible, recursively collect visible elements - collectVisibleElements(child, visibleElements); + // Iterate over the parent's children + const children = parent.children; + for (let i = 0; i < children.length; i++) { + const child = children[i]; + const computedStyle = window.getComputedStyle(child); + + if ( + child.nodeType === Node.ELEMENT_NODE && + computedStyle.display !== 'none' && + computedStyle.visibility !== 'hidden' && + child.offsetWidth >= 0 && + child.offsetHeight >= 0 && + computedStyle.contentVisibility !== 'hidden' + ) { + // If the child is an element and is visible, recursively collect visible elements + collectVisibleElements(child, visibleElements); + } } } -} // Create an array to hold the visible elements -const visibleElementsArray = []; + const visibleElementsArray = []; // Call collectVisibleElements with the starting parent element -collectVisibleElements(document.body, visibleElementsArray); + collectVisibleElements(document.body, visibleElementsArray); -visibleElementsArray.forEach(function (element) { + visibleElementsArray.forEach(function (element) { - bbox = element.getBoundingClientRect(); + bbox = element.getBoundingClientRect(); - // Skip really small ones, and where width or height ==0 - if (bbox['width'] * bbox['height'] < 10) { - return - } + // Skip really small ones, and where width or height ==0 + if (bbox['width'] * bbox['height'] < 10) { + return + } - // Don't include elements that are offset from canvas - if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) { - return - } + // Don't include elements that are offset from canvas + if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) { + return + } - // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes - // it should not traverse when we know we can anchor off just an ID one level up etc.. - // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match + // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes + // it should not traverse when we know we can anchor off just an ID one level up etc.. + // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match - // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us. - xpath_result = false; - try { - var d = findUpTag(element); - if (d) { - xpath_result = d; - } - } catch (e) { - console.log(e); - } - // You could swap it and default to getXpath and then try the smarter one - // default back to the less intelligent one - if (!xpath_result) { + // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us. + xpath_result = false; try { - // I've seen on FB and eBay that this doesnt work - // ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), :67:20) at UtilityScript.evaluate (:159:18) at UtilityScript. (:1:44) - xpath_result = getxpath(element); + var d = findUpTag(element); + if (d) { + xpath_result = d; + } } catch (e) { console.log(e); - return } - } + // You could swap it and default to getXpath and then try the smarter one + // default back to the less intelligent one + if (!xpath_result) { + try { + // I've seen on FB and eBay that this doesnt work + // ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), :67:20) at UtilityScript.evaluate (:159:18) at UtilityScript. (:1:44) + xpath_result = getxpath(element); + } catch (e) { + console.log(e); + return + } + } - let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now + let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now - let text = element.textContent.trim().slice(0, 30).trim(); - while (/\n{2,}|\t{2,}/.test(text)) { - text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') - } + let text = element.textContent.trim().slice(0, 30).trim(); + while (/\n{2,}|\t{2,}/.test(text)) { + text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') + } - // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. - const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; - const computedStyle = window.getComputedStyle(element); - - size_pos.push({ - xpath: xpath_result, - width: Math.round(bbox['width']), - height: Math.round(bbox['height']), - left: Math.floor(bbox['left']), - top: Math.floor(bbox['top']) + scroll_y, - // tagName used by Browser Steps - tagName: (element.tagName) ? element.tagName.toLowerCase() : '', - // tagtype used by Browser Steps - tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', - isClickable: computedStyle.cursor === "pointer", - // Used by the keras trainer - fontSize: computedStyle.getPropertyValue('font-size'), - fontWeight: computedStyle.getPropertyValue('font-weight'), - hasDigitCurrency: hasDigitCurrency, - label: label, - }); + // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. + const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6))) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text); + const computedStyle = window.getComputedStyle(element); -}); + if (Math.floor(bbox['top']) + scroll_y > max_height) { + return + } + + size_pos.push({ + xpath: xpath_result, + width: Math.round(bbox['width']), + height: Math.round(bbox['height']), + left: Math.floor(bbox['left']), + top: Math.floor(bbox['top']) + scroll_y, + // tagName used by Browser Steps + tagName: (element.tagName) ? element.tagName.toLowerCase() : '', + // tagtype used by Browser Steps + tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', + isClickable: computedStyle.cursor === "pointer", + // Used by the keras trainer + fontSize: computedStyle.getPropertyValue('font-size'), + fontWeight: computedStyle.getPropertyValue('font-weight'), + hasDigitCurrency: hasDigitCurrency, + label: label, + }); + + }); // Inject the current one set in the include_filters, which may be a CSS rule // used for displaying the current one in VisualSelector, where its not one we generated. -if (include_filters.length) { - let results; - // Foreach filter, go and find it on the page and add it to the results so we can visualise it again - for (const f of include_filters) { - bbox = false; - q = false; - - if (!f.length) { - console.log("xpath_element_scraper: Empty filter, skipping"); - continue; - } - - try { - // is it xpath? - if (f.startsWith('/') || f.startsWith('xpath')) { - var qry_f = f.replace(/xpath(:|\d:)/, '') - console.log("[xpath] Scanning for included filter " + qry_f) - let xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); - results = []; - for (let i = 0; i < xpathResult.snapshotLength; i++) { - results.push(xpathResult.snapshotItem(i)); - } - } else { - console.log("[css] Scanning for included filter " + f) - console.log("[css] Scanning for included filter " + f); - results = document.querySelectorAll(f); + if (include_filters.length) { + let results; + // Foreach filter, go and find it on the page and add it to the results so we can visualise it again + for (const f of include_filters) { + bbox = false; + q = false; + + if (!f.length) { + console.log("xpath_element_scraper: Empty filter, skipping"); + continue; } - } catch (e) { - // Maybe catch DOMException and alert? - console.log("xpath_element_scraper: Exception selecting element from filter " + f); - console.log(e); - } - if (results != null && results.length) { - - // Iterate over the results - results.forEach(node => { - // Try to resolve //something/text() back to its /something so we can atleast get the bounding box - try { - if (typeof node.nodeName == 'string' && node.nodeName === '#text') { - node = node.parentElement + try { + // is it xpath? + if (f.startsWith('/') || f.startsWith('xpath')) { + var qry_f = f.replace(/xpath(:|\d:)/, '') + console.log("[xpath] Scanning for included filter " + qry_f) + let xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + results = []; + for (let i = 0; i < xpathResult.snapshotLength; i++) { + results.push(xpathResult.snapshotItem(i)); } - } catch (e) { - console.log(e) - console.log("xpath_element_scraper: #text resolver") + } else { + console.log("[css] Scanning for included filter " + f) + console.log("[css] Scanning for included filter " + f); + results = document.querySelectorAll(f); } + } catch (e) { + // Maybe catch DOMException and alert? + console.log("xpath_element_scraper: Exception selecting element from filter " + f); + console.log(e); + } - // #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element. - if (typeof node.getBoundingClientRect == 'function') { - bbox = node.getBoundingClientRect(); - console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y) - } else { + if (results != null && results.length) { + + // Iterate over the results + results.forEach(node => { + // Try to resolve //something/text() back to its /something so we can atleast get the bounding box try { - // Try and see we can find its ownerElement - bbox = node.ownerElement.getBoundingClientRect(); - console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y) + if (typeof node.nodeName == 'string' && node.nodeName === '#text') { + node = node.parentElement + } } catch (e) { console.log(e) - console.log("xpath_element_scraper: error looking up q.ownerElement") + console.log("xpath_element_scraper: #text resolver") } - } - if (bbox && bbox['width'] > 0 && bbox['height'] > 0) { - size_pos.push({ - xpath: f, - width: parseInt(bbox['width']), - height: parseInt(bbox['height']), - left: parseInt(bbox['left']), - top: parseInt(bbox['top']) + scroll_y, - highlight_as_custom_filter: true - }); - } - }); + // #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element. + if (typeof node.getBoundingClientRect == 'function') { + bbox = node.getBoundingClientRect(); + console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y) + } else { + try { + // Try and see we can find its ownerElement + bbox = node.ownerElement.getBoundingClientRect(); + console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y) + } catch (e) { + console.log(e) + console.log("xpath_element_scraper: error looking up q.ownerElement") + } + } + + if (bbox && bbox['width'] > 0 && bbox['height'] > 0) { + size_pos.push({ + xpath: f, + width: parseInt(bbox['width']), + height: parseInt(bbox['height']), + left: parseInt(bbox['left']), + top: parseInt(bbox['top']) + scroll_y, + highlight_as_custom_filter: true + }); + } + }); + } } } -} // Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area // so that we dont select the wrapping element by mistake and be unable to select what we want -size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1) + size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1) + +// browser_width required for proper scaling in the frontend + // Return as a string to save playwright for juggling thousands of objects + return JSON.stringify({'size_pos': size_pos, 'browser_width': window.innerWidth}); +} -// Window.width required for proper scaling in the frontend -return {'size_pos': size_pos, 'browser_width': window.innerWidth}; diff --git a/changedetectionio/content_fetchers/screenshot_handler.py b/changedetectionio/content_fetchers/screenshot_handler.py new file mode 100644 index 00000000000..04133b864a6 --- /dev/null +++ b/changedetectionio/content_fetchers/screenshot_handler.py @@ -0,0 +1,73 @@ +# Pages with a vertical height longer than this will use the 'stitch together' method. + +# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices). +# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits. +# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer. + +from loguru import logger + +from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, SCREENSHOT_DEFAULT_QUALITY + + +def stitch_images_worker(pipe_conn, chunks_bytes, original_page_height, capture_height): + import os + import io + from PIL import Image, ImageDraw, ImageFont + + try: + + # Load images from byte chunks + images = [Image.open(io.BytesIO(b)) for b in chunks_bytes] + total_height = sum(im.height for im in images) + max_width = max(im.width for im in images) + + # Create stitched image + stitched = Image.new('RGB', (max_width, total_height)) + y_offset = 0 + for im in images: + stitched.paste(im, (0, y_offset)) + y_offset += im.height + + # Draw caption on top (overlaid, not extending canvas) + draw = ImageDraw.Draw(stitched) + + + caption_text = f"WARNING: Screenshot was {original_page_height}px but trimmed to {capture_height}px because it was too long" + padding = 10 + font_size = 35 + font_color = (255, 0, 0) + background_color = (255, 255, 255) + + + # Try to load a proper font + try: + font = ImageFont.truetype("arial.ttf", font_size) + except IOError: + font = ImageFont.load_default() + + bbox = draw.textbbox((0, 0), caption_text, font=font) + text_width = bbox[2] - bbox[0] + text_height = bbox[3] - bbox[1] + + # Draw white rectangle background behind text + rect_top = 0 + rect_bottom = text_height + 2 * padding + draw.rectangle([(0, rect_top), (max_width, rect_bottom)], fill=background_color) + + # Draw text centered horizontally, 10px padding from top of the rectangle + text_x = (max_width - text_width) // 2 + text_y = padding + draw.text((text_x, text_y), caption_text, font=font, fill=font_color) + + # Encode and send image + output = io.BytesIO() + stitched.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY))) + pipe_conn.send_bytes(output.getvalue()) + + stitched.close() + except Exception as e: + pipe_conn.send(f"error:{e}") + finally: + pipe_conn.close() + + diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 7ed5e2c100e..5b7201186c0 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -394,7 +394,7 @@ def static_content(group, filename): response.headers['Content-Type'] = 'application/json' response.headers['Content-Encoding'] = 'deflate' else: - logger.error(f'Request elements.deflate at "{watch_directory}" but was notfound.') + logger.error(f'Request elements.deflate at "{watch_directory}" but was not found.') abort(404) if response: diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 3fbe8f94a72..86e93983f56 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -553,7 +553,10 @@ def save_xpath_data(self, data, as_error=False): self.ensure_data_dir_exists() with open(target_path, 'wb') as f: - f.write(zlib.compress(json.dumps(data).encode())) + if not isinstance(data, str): + f.write(zlib.compress(json.dumps(data).encode())) + else: + f.write(zlib.compress(data.encode())) f.close() # Save as PNG, PNG is larger but better for doing visual diff in the future diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 38a8f73a61e..2e9e2294dc3 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -592,6 +592,7 @@ def run(self): self.current_uuid = None # Done self.q.task_done() + update_handler = None logger.debug(f"Watch {uuid} done in {time.time()-fetch_start_time:.2f}s") # Give the CPU time to interrupt