Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions changedetectionio/content_fetchers/playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ def capture_full_page(page):
p.join()
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")

# Explicit cleanup
del screenshot_chunks
del p
del parent_conn, child_conn
screenshot_chunks = None
return screenshot

Expand Down Expand Up @@ -286,12 +289,28 @@ def run(self,
pass

# Clean up resources properly
context.close()
context = None
try:
self.page.request_gc()
except:
pass

self.page.close()
try:
self.page.close()
except:
pass
self.page = None

browser.close()
borwser = None
try:
context.close()
except:
pass
context = None

try:
browser.close()
except:
pass
browser = None



20 changes: 16 additions & 4 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,9 @@ def repl(m):

return re.sub(pattern, repl, html_content)

def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:

def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):

from inscriptis import get_text
from inscriptis.model.config import ParserConfig

Expand Down Expand Up @@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
html_content = re.sub(r'</title>', r'</h1>', html_content)

text_content = get_text(html_content, config=parser_config)

return text_content

conn.send(text_content)
conn.close()

# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
    """Convert HTML to plain text by running the conversion in a child process.

    The actual work is done by ``html_to_text_sub_worker``, which sends its
    result back over a pipe. Doing the conversion in a short-lived process
    means that whatever memory the underlying parsing libraries hold on to is
    returned to the OS when that process exits.

    Args:
        html_content: The HTML markup to convert.
        render_anchor_tag_content: Forwarded unchanged to the worker.
        is_rss: Forwarded unchanged to the worker.

    Returns:
        Whatever object the worker sent over the pipe (the extracted text).

    Raises:
        EOFError: If the worker exits without sending a result (for example
            because it raised an exception), instead of blocking forever.
    """
    from multiprocessing import Process, Pipe

    parent_conn, child_conn = Pipe()
    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
    p.start()

    # The child now owns its own handle on child_conn. Close ours so that if
    # the child dies before sending, recv() sees EOF and raises rather than
    # waiting indefinitely on an end that we ourselves were keeping open.
    child_conn.close()

    try:
        text = parent_conn.recv()
    finally:
        # Close our end before joining so a worker still blocked in send()
        # fails fast instead of keeping join() waiting.
        parent_conn.close()
        p.join()
    return text

# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
Expand Down