From 3a102becf4b45030d88d168ba844720f273e499c Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Fri, 3 Oct 2025 19:28:20 +0530 Subject: [PATCH 1/7] Feat: Add image extraction for webpages (media parsing) --- mediaParser.js | 44 ++++++++++++++++++++++++++++++++++++++++++++ package.json | 4 +++- 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 mediaParser.js diff --git a/mediaParser.js b/mediaParser.js new file mode 100644 index 000000000..a2dcee8e7 --- /dev/null +++ b/mediaParser.js @@ -0,0 +1,44 @@ +const axios = require('axios'); +const cheerio = require('cheerio'); +const { URL } = require('url'); + +async function extractImages(url) { + try { + // 1. Fetch HTML + const { data: html } = await axios.get(url); + + // 2. Load HTML into cheerio + const $ = cheerio.load(html); + + const images = []; + const seen = new Set(); // to track duplicates + + // 3. Loop through each tag + $('img').each((index, element) => { + let src = $(element).attr('src'); + const alt = $(element).attr('alt') || ''; + + if (src) { + // 4. Convert relative URLs to absolute URLs + try { + src = new URL(src, url).href; + } catch { + // skip invalid URLs + return; + } + + // 5. Skip duplicates + if (!seen.has(src)) { + seen.add(src); + images.push({ url: src, altText: alt }); + } + } + }); + + return images; + + } catch (error) { + console.log('Oops! Something went wrong while fetching images:', error.message); + return []; + } +} \ No newline at end of file diff --git a/package.json b/package.json index a5eab10c6..c9d144b71 100644 --- a/package.json +++ b/package.json @@ -24,10 +24,11 @@ "@types/react-dom": "^18.0.1", "@types/uuid": "^8.3.4", "airtable": "^0.12.2", - "axios": "^1.9.0", + "axios": "^1.12.2", "bcrypt": "^5.1.1", "body-parser": "^1.20.3", "buffer": "^6.0.3", + "cheerio": "^1.1.2", "connect-pg-simple": "^10.0.0", "cookie-parser": "^1.4.6", "cors": "^2.8.5", @@ -80,6 +81,7 @@ "styled-components": "^5.3.3", "swagger-jsdoc": "^6.2.8", "swagger-ui-express": "^5.0.1", + "tesseract.js": "^6.0.1", "typedoc": "^0.23.8", "typescript": "^4.6.3", "uuid": "^8.3.2", From df2efee3f715ea8c9445a883dda55f782e0927a1 Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Fri, 3 Oct 2025 22:24:29 +0530 Subject: [PATCH 2/7] Refactor: Improve image extraction with ES6, validation, srcset, and better error handling --- mediaParser.js | 74 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/mediaParser.js b/mediaParser.js index a2dcee8e7..662eff889 100644 --- a/mediaParser.js +++ b/mediaParser.js @@ -1,36 +1,66 @@ -const axios = require('axios'); -const cheerio = require('cheerio'); -const { URL } = require('url'); +import axios from 'axios'; +import * as cheerio from 'cheerio'; +import { URL } from 'url'; +import logger from './logger'; // Adjust path if necessary async function extractImages(url) { + // Input validation + if (!url || typeof url !== 'string') { + throw new TypeError('URL must be a non-empty string'); + } + try { - // 1. Fetch HTML - const { data: html } = await axios.get(url); + // Fetch HTML with proper axios config + const { data: html } = await axios.get(url, { + timeout: 10000, + maxContentLength: 10 * 1024 * 1024, + maxBodyLength: 10 * 1024 * 1024, + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; MaxunBot/1.0; +https://maxun.dev)' + }, + maxRedirects: 5 + }); - // 2. 
Load HTML into cheerio - const $ = cheerio.load(html); + // Load HTML + const $ = cheerio.load(html, { + decodeEntities: true, + normalizeWhitespace: false + }); const images = []; - const seen = new Set(); // to track duplicates + const seen = new Set(); - // 3. Loop through each tag $('img').each((index, element) => { - let src = $(element).attr('src'); const alt = $(element).attr('alt') || ''; - + + // Handle src + let src = $(element).attr('src'); if (src) { - // 4. Convert relative URLs to absolute URLs try { - src = new URL(src, url).href; + const absoluteUrl = new URL(src, url).href; + if (!seen.has(absoluteUrl)) { + seen.add(absoluteUrl); + images.push({ url: absoluteUrl, altText: alt }); + } } catch { - // skip invalid URLs - return; + logger.warn(`Invalid image URL: ${src}`); } + } - // 5. Skip duplicates - if (!seen.has(src)) { - seen.add(src); - images.push({ url: src, altText: alt }); + // Handle srcset + const srcset = $(element).attr('srcset'); + if (srcset) { + const srcsetUrls = srcset.split(',').map(s => s.trim().split(/\s+/)[0]); + for (const srcsetUrl of srcsetUrls) { + try { + const absoluteUrl = new URL(srcsetUrl, url).href; + if (!seen.has(absoluteUrl)) { + seen.add(absoluteUrl); + images.push({ url: absoluteUrl, altText: alt }); + } + } catch { + logger.warn(`Invalid srcset URL: ${srcsetUrl}`); + } } } }); @@ -38,7 +68,7 @@ async function extractImages(url) { return images; } catch (error) { - console.log('Oops! Something went wrong while fetching images:', error.message); - return []; + logger.error('Failed to extract images', { url, error: error.message }); + throw new Error(`Failed to extract images from ${url}: ${error.message}`); } -} \ No newline at end of file +} From caaf13164c4d3bdf8246ed03e8e2676aa32e6797 Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Fri, 3 Oct 2025 22:35:02 +0530 Subject: [PATCH 3/7] Refactor: Improve image extraction - Switch to ES6 module syntax (import/export) - Add URL input validation - Configure axios for timeout, headers, content limits - Validate Content-Type before parsing HTML - Handle srcset with empty URL filtering - Remove duplicate images - Use Winston logger for warnings and errors - Add human-friendly JSDoc comments --- mediaParser.js | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/mediaParser.js b/mediaParser.js index 662eff889..60f6c7a27 100644 --- a/mediaParser.js +++ b/mediaParser.js @@ -3,15 +3,29 @@ import * as cheerio from 'cheerio'; import { URL } from 'url'; import logger from './logger'; // Adjust path if necessary +/** + * Fetches and extracts all images from a webpage, including responsive ones. + * This includes regular tags and srcset URLs used for different screen sizes. + * + * @param {string} url - The webpage URL to extract images from. + * Must be a valid, non-empty string. + * @returns {Array} - An array of objects, each containing: + * { + * url: string, // The absolute URL of the image + * altText: string // The alt text of the image (if any) + * } + * @throws {TypeError} - If the URL is missing or not a string. + * @throws {Error} - If the fetch fails or the response is not HTML. + */ async function extractImages(url) { - // Input validation + // 1. Validate input if (!url || typeof url !== 'string') { throw new TypeError('URL must be a non-empty string'); } try { - // Fetch HTML with proper axios config - const { data: html } = await axios.get(url, { + // 2. 
Fetch HTML with axios configured for reliability + const response = await axios.get(url, { timeout: 10000, maxContentLength: 10 * 1024 * 1024, maxBodyLength: 10 * 1024 * 1024, @@ -21,7 +35,15 @@ async function extractImages(url) { maxRedirects: 5 }); - // Load HTML + // 3. Validate content-type + const contentType = response.headers['content-type'] || ''; + if (!contentType.includes('text/html')) { + throw new Error(`Expected HTML but got ${contentType}`); + } + + const html = response.data; + + // 4. Load HTML into cheerio const $ = cheerio.load(html, { decodeEntities: true, normalizeWhitespace: false @@ -30,10 +52,11 @@ async function extractImages(url) { const images = []; const seen = new Set(); + // 5. Extract tags $('img').each((index, element) => { const alt = $(element).attr('alt') || ''; - - // Handle src + + // 5a. Handle src let src = $(element).attr('src'); if (src) { try { @@ -47,10 +70,13 @@ async function extractImages(url) { } } - // Handle srcset + // 5b. Handle srcset (responsive images) const srcset = $(element).attr('srcset'); if (srcset) { - const srcsetUrls = srcset.split(',').map(s => s.trim().split(/\s+/)[0]); + const srcsetUrls = srcset.split(',') + .map(s => s.trim().split(/\s+/)[0]) + .filter(Boolean); // Remove empty strings + for (const srcsetUrl of srcsetUrls) { try { const absoluteUrl = new URL(srcsetUrl, url).href; @@ -68,7 +94,8 @@ async function extractImages(url) { return images; } catch (error) { + // Log errors and throw for the caller logger.error('Failed to extract images', { url, error: error.message }); throw new Error(`Failed to extract images from ${url}: ${error.message}`); } -} +} \ No newline at end of file From d24496c19205356d3d6e143c79753539e763d75d Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Fri, 3 Oct 2025 22:41:43 +0530 Subject: [PATCH 4/7] Refactor: Enhance image extraction - Export extractImages function for module usage - Preserve original error stack trace using 'cause' - Validate Content-Type before parsing HTML - Filter out data: URLs to avoid bloating results - Handle tags for responsive images - Deduplicate images - Maintain Winston logging for errors and warnings - Human-friendly JSDoc comments and ES6 syntax --- mediaParser.js | 60 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/mediaParser.js b/mediaParser.js index 60f6c7a27..c8b0b192e 100644 --- a/mediaParser.js +++ b/mediaParser.js @@ -1,15 +1,17 @@ +// mediaParser.js + import axios from 'axios'; import * as cheerio from 'cheerio'; import { URL } from 'url'; import logger from './logger'; // Adjust path if necessary /** - * Fetches and extracts all images from a webpage, including responsive ones. - * This includes regular tags and srcset URLs used for different screen sizes. + * Fetches and extracts all images from a webpage, including responsive images. + * This includes regular tags, srcset URLs, and tags within elements. * * @param {string} url - The webpage URL to extract images from. * Must be a valid, non-empty string. - * @returns {Array} - An array of objects, each containing: + * @returns {Array} - An array of objects: * { * url: string, // The absolute URL of the image * altText: string // The alt text of the image (if any) @@ -18,13 +20,12 @@ import logger from './logger'; // Adjust path if necessary * @throws {Error} - If the fetch fails or the response is not HTML. */ async function extractImages(url) { - // 1. 
Validate input if (!url || typeof url !== 'string') { throw new TypeError('URL must be a non-empty string'); } try { - // 2. Fetch HTML with axios configured for reliability + // Fetch webpage with axios const response = await axios.get(url, { timeout: 10000, maxContentLength: 10 * 1024 * 1024, @@ -35,15 +36,13 @@ async function extractImages(url) { maxRedirects: 5 }); - // 3. Validate content-type + // Validate that content is HTML const contentType = response.headers['content-type'] || ''; if (!contentType.includes('text/html')) { throw new Error(`Expected HTML but got ${contentType}`); } const html = response.data; - - // 4. Load HTML into cheerio const $ = cheerio.load(html, { decodeEntities: true, normalizeWhitespace: false @@ -52,16 +51,15 @@ async function extractImages(url) { const images = []; const seen = new Set(); - // 5. Extract tags + // Extract tags $('img').each((index, element) => { const alt = $(element).attr('alt') || ''; - - // 5a. Handle src let src = $(element).attr('src'); + if (src) { try { const absoluteUrl = new URL(src, url).href; - if (!seen.has(absoluteUrl)) { + if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) { seen.add(absoluteUrl); images.push({ url: absoluteUrl, altText: alt }); } @@ -70,17 +68,17 @@ async function extractImages(url) { } } - // 5b. Handle srcset (responsive images) + // Handle srcset (responsive images) const srcset = $(element).attr('srcset'); if (srcset) { const srcsetUrls = srcset.split(',') .map(s => s.trim().split(/\s+/)[0]) - .filter(Boolean); // Remove empty strings + .filter(Boolean); for (const srcsetUrl of srcsetUrls) { try { const absoluteUrl = new URL(srcsetUrl, url).href; - if (!seen.has(absoluteUrl)) { + if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) { seen.add(absoluteUrl); images.push({ url: absoluteUrl, altText: alt }); } @@ -91,11 +89,35 @@ async function extractImages(url) { } }); + // Extract tags inside elements + $('picture source').each((i, element) => { + const srcset = $(element).attr('srcset'); + if (srcset) { + const srcsetUrls = srcset.split(',') + .map(s => s.trim().split(/\s+/)[0]) + .filter(Boolean); + + for (const srcsetUrl of srcsetUrls) { + try { + const absoluteUrl = new URL(srcsetUrl, url).href; + if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) { + seen.add(absoluteUrl); + images.push({ url: absoluteUrl, altText: '' }); + } + } catch { + logger.warn(`Invalid srcset URL in : ${srcsetUrl}`); + } + } + } + }); + return images; } catch (error) { - // Log errors and throw for the caller - logger.error('Failed to extract images', { url, error: error.message }); - throw new Error(`Failed to extract images from ${url}: ${error.message}`); + // Preserve original stack trace + throw new Error(`Failed to extract images from ${url}`, { cause: error }); } -} \ No newline at end of file +} + +// Export function for other modules +export { extractImages }; From 6624bbafb93763107e1b95163862f21853de20d8 Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Fri, 10 Oct 2025 23:44:37 +0530 Subject: [PATCH 5/7] feat(issue #164): integrate media extraction into recorder flow --- maxun-core/src/browserSide/scraper.js | 125 +++++++++++++++++- package.json | 1 + .../src/browser-management/inputHandlers.ts | 29 ++++ .../workflow-management/classes/Generator.ts | 30 +++++ .../recorder/DOMBrowserRenderer.tsx | 23 ++++ 5 files changed, 207 insertions(+), 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index fdf1ff9c9..756c630fd 100644 
--- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -1510,4 +1510,127 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return results; }; -})(window); \ No newline at end of file +})(window); + +/* Media extraction support - listens for clicks on media elements and extracts text. + Sends { url, tag, selector, extractedText } via postMessage to the parent window. */ + +// Extract text from image: alt/title first, then OCR via Tesseract if available +async function extractImageText(img) { + try { + const altTitle = (img.alt || img.title || '').trim(); + if (altTitle) return altTitle; + + if (window.Tesseract && typeof window.Tesseract.recognize === 'function') { + // Use the image src (may be data: or remote); ignore if data: that contains large chunks + const src = img.currentSrc || img.src || ''; + if (!src) return ''; + try { + const result = await window.Tesseract.recognize(src, 'eng'); + return (result?.data?.text || '').trim(); + } catch (e) { + return ''; + } + } + } catch (e) { + return ''; + } + return ''; +} + +// Extract text from PDF using pdf.js if available +async function extractPdfText(url) { + try { + if (!window.pdfjsLib) return ''; + const loadingTask = window.pdfjsLib.getDocument(url); + const pdf = await loadingTask.promise; + let text = ''; + for (let i = 1; i <= pdf.numPages; i++) { + // eslint-disable-next-line no-await-in-loop + const page = await pdf.getPage(i); + // eslint-disable-next-line no-await-in-loop + const content = await page.getTextContent(); + text += content.items.map((it) => it.str).join(' ') + '\n'; + } + return text.trim(); + } catch (e) { + return ''; + } +} + +// Helper to generate structural selector if function is available +function structuralSelector(el) { + try { + if (typeof GetSelectorStructural === 'function') return GetSelectorStructural(el); + } catch (e) { + // fallthrough + } + return ''; +} + +// Click listener for media elements +document.addEventListener('click', async (ev) => { + try { + const el = ev.target; + if (!el || !el.tagName) return; + const tag = el.tagName.toLowerCase(); + let url = ''; + let selector = structuralSelector(el); + let extractedText = ''; + + if (tag === 'img') { + url = el.currentSrc || el.src || ''; + extractedText = (el.alt || el.title || '').trim(); + if (!extractedText) extractedText = await extractImageText(el); + } else if (tag === 'iframe' || tag === 'embed') { + url = el.src || el.data || ''; + if (url && /\.pdf(\?|$)/i.test(url)) { + extractedText = await extractPdfText(url); + } + } else if (tag === 'object') { + // style + url = el.data || ''; + if (url && /\.pdf(\?|$)/i.test(url)) { + extractedText = await extractPdfText(url); + } + } + + if (url && extractedText) { + // Post to parent so the recorder frontend (or wrapper) can relay it to server socket + try { + window.parent.postMessage({ + type: 'maxun:media-extracted', + url, + tag, + selector, + extractedText + }, '*'); + } catch (e) { + // ignore + } + } + } catch (e) { + // swallow + } +}); + +// Load Tesseract and PDF.js if not already present (CDN). 
+if (!window.Tesseract) { + const s = document.createElement('script'); + s.src = 'https://cdn.jsdelivr.net/npm/tesseract.js@4.0.2/dist/tesseract.min.js'; + s.async = true; + document.head.appendChild(s); +} +if (!window.pdfjsLib) { + const s2 = document.createElement('script'); + s2.src = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.min.js'; + s2.async = true; + s2.onload = () => { + try { + // eslint-disable-next-line no-undef + window.pdfjsLib = window['pdfjs-dist/build/pdf']; + if (window.pdfjsLib) window.pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.worker.min.js'; + } catch (e) {} + }; + document.head.appendChild(s2); +} \ No newline at end of file diff --git a/package.json b/package.json index c9d144b71..4d62fcb7c 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", + "pdfjs-dist": "^5.4.296", "pg": "^8.13.0", "pg-boss": "^10.1.6", "pkce-challenge": "^4.1.0", diff --git a/server/src/browser-management/inputHandlers.ts b/server/src/browser-management/inputHandlers.ts index c014af3d9..a3859b429 100644 --- a/server/src/browser-management/inputHandlers.ts +++ b/server/src/browser-management/inputHandlers.ts @@ -608,6 +608,34 @@ const handleGoForward = async (activeBrowser: RemoteBrowser, page: Page) => { } }; +/** + * Handle media extracted event forwarded from client (via postMessage relay). + * data: { url, tag, selector, extractedText } + */ +const onMediaExtracted = async (data: { url: string; tag: string; selector: string; extractedText: string }, userId: string) => { + logger.log('debug', 'Handling media-extracted event emitted from client'); + await handleWrapper(handleMediaExtracted, userId, data); +} + +const handleMediaExtracted = async (activeBrowser: RemoteBrowser, page: Page, data: { url: string; tag: string; selector: string; extractedText: string }) => { + try { + if (page.isClosed()) { + logger.log("debug", `Ignoring media-extracted event: page is closed`); + return; + } + const generator = activeBrowser.generator; + if (generator && typeof generator.handleMediaExtracted === 'function') { + await generator.handleMediaExtracted(data, page); + } else { + logger.log('warn', 'Generator does not implement handleMediaExtracted'); + } + logger.log('debug', `Media extracted added: ${data.url}`); + } catch (e) { + const { message } = e as Error; + logger.log('warn', `Error handling media-extracted event: ${message}`); + } +} + /** * Handles the click action event. * @param activeBrowser - the active remote browser {@link RemoteBrowser} @@ -851,6 +879,7 @@ const registerInputHandlers = (socket: Socket, userId: string) => { socket.on("dom:click", (data) => onDOMClickAction(data, userId)); socket.on("dom:keypress", (data) => onDOMKeyboardAction(data, userId)); socket.on("dom:addpair", (data) => onDOMWorkflowPair(data, userId)); + socket.on("dom:media-extracted", (data) => onMediaExtracted(data, userId)); }; export default registerInputHandlers; diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index a5bc2edc4..c90407c16 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -149,6 +149,36 @@ export class WorkflowGenerator { }); } + /** + * Handle media extraction event from browser-side snippet. + * Appends a media event object to workflowRecord.events. 
+ */ + public async handleMediaExtracted(data: { url: string; tag: string; selector: string; extractedText: string }, page: Page) { + try { + if (!this.workflowRecord) this.workflowRecord = { workflow: [] } as WorkflowFile; + // Ensure events array exists on workflowRecord (non-standard addition) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (this.workflowRecord as any).events = (this.workflowRecord as any).events || []; + (this.workflowRecord as any).events.push({ + type: 'media', + url: data.url, + tag: data.tag, + selector: data.selector, + extractedText: data.extractedText, + timestamp: Date.now(), + }); + + // notify client of new event if needed + try { + this.socket.emit('workflow:media-added', { url: data.url, selector: data.selector }); + } catch (e) { + // ignore + } + } catch (e) { + logger.log('warn', `handleMediaExtracted failed: ${(e as Error).message}`); + } + } + /** * Registers the event handlers for all generator-related events on the socket. * @param socket The socket used to communicate with the client. diff --git a/src/components/recorder/DOMBrowserRenderer.tsx b/src/components/recorder/DOMBrowserRenderer.tsx index 7fcafdeb4..9f8d035a8 100644 --- a/src/components/recorder/DOMBrowserRenderer.tsx +++ b/src/components/recorder/DOMBrowserRenderer.tsx @@ -199,6 +199,29 @@ export const DOMBrowserRenderer: React.FC = ({ clientSelectorGenerator.setPaginationMode(paginationMode); }, [getList, listSelector, paginationMode]); + // Relay media-extracted postMessage from the iframe to server socket + useEffect(() => { + const handler = (ev: MessageEvent) => { + try { + const data = ev.data; + if (!data || data.type !== 'maxun:media-extracted') return; + const payload = { + url: data.url, + tag: data.tag, + selector: data.selector, + extractedText: data.extractedText, + }; + if (socket && socket.emit) { + socket.emit('dom:media-extracted', payload); + } + } catch (e) { + // ignore + } + }; + window.addEventListener('message', handler); + return () => window.removeEventListener('message', handler); + }, [socket]); + useEffect(() => { if (listSelector) { clientSelectorGenerator.setListSelector(listSelector); From 3a68e600b688d829467710c1f44b922314a47c79 Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Sat, 11 Oct 2025 00:02:29 +0530 Subject: [PATCH 6/7] feat(issue #164): integrate media extraction into recorder flow --- .../recorder/DOMBrowserRenderer.tsx | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/components/recorder/DOMBrowserRenderer.tsx b/src/components/recorder/DOMBrowserRenderer.tsx index 9f8d035a8..9c576a1bd 100644 --- a/src/components/recorder/DOMBrowserRenderer.tsx +++ b/src/components/recorder/DOMBrowserRenderer.tsx @@ -205,6 +205,27 @@ export const DOMBrowserRenderer: React.FC = ({ try { const data = ev.data; if (!data || data.type !== 'maxun:media-extracted') return; + + // Ensure the message comes from the recorded iframe only + const iframeWindow = iframeRef.current?.contentWindow || null; + if (ev.source !== iframeWindow) { + // Not from the recorded iframe - ignore + return; + } + + // If snapshot.baseUrl is available, validate origin when possible + try { + if (snapshot?.baseUrl) { + const expectedOrigin = new URL(snapshot.baseUrl).origin; + if (ev.origin && ev.origin !== 'null' && ev.origin !== expectedOrigin) { + // origin mismatch - ignore + return; + } + } + } catch (e) { + // ignore origin validation failures and proceed only if source matched + } + const payload = { url: data.url, tag: data.tag, @@ 
-220,7 +241,7 @@ export const DOMBrowserRenderer: React.FC = ({ }; window.addEventListener('message', handler); return () => window.removeEventListener('message', handler); - }, [socket]); + }, [socket, iframeRef, snapshot]); useEffect(() => { if (listSelector) { From 976884473a5098fd133fbef815a0b4fa5713032a Mon Sep 17 00:00:00 2001 From: Aman Raj Date: Sun, 12 Oct 2025 21:41:30 +0530 Subject: [PATCH 7/7] more fixes for the image parser --- .../recorder/DOMBrowserRenderer.tsx | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/components/recorder/DOMBrowserRenderer.tsx b/src/components/recorder/DOMBrowserRenderer.tsx index 9c576a1bd..f9770b492 100644 --- a/src/components/recorder/DOMBrowserRenderer.tsx +++ b/src/components/recorder/DOMBrowserRenderer.tsx @@ -210,20 +210,42 @@ export const DOMBrowserRenderer: React.FC = ({ const iframeWindow = iframeRef.current?.contentWindow || null; if (ev.source !== iframeWindow) { // Not from the recorded iframe - ignore + // console.debug('Dropped media-extracted: source mismatch'); return; } - // If snapshot.baseUrl is available, validate origin when possible - try { - if (snapshot?.baseUrl) { + // Require a non-null origin for messages + if (!ev.origin || ev.origin === 'null') { + // console.debug('Dropped media-extracted: null origin'); + return; + } + + // If snapshot.baseUrl is available, validate origin and also verify data.url origin + if (snapshot?.baseUrl) { + try { const expectedOrigin = new URL(snapshot.baseUrl).origin; - if (ev.origin && ev.origin !== 'null' && ev.origin !== expectedOrigin) { + if (ev.origin !== expectedOrigin) { // origin mismatch - ignore + // console.debug('Dropped media-extracted: origin mismatch', ev.origin, expectedOrigin); return; } + + // Validate that the reported data.url has the same origin + try { + const reportedOrigin = new URL(data.url).origin; + if (reportedOrigin !== expectedOrigin) { + // reported url is not from the recorded page origin + // console.debug('Dropped media-extracted: data.url origin mismatch', reportedOrigin, expectedOrigin); + return; + } + } catch (e) { + // invalid data.url - drop + return; + } + } catch (e) { + // If snapshot.baseUrl parsing failed, drop the message + return; } - } catch (e) { - // ignore origin validation failures and proceed only if source matched } const payload = {
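
Below is a minimal usage sketch for the parser added in PATCH 1-4. The import path, file name, and target URL are assumptions made for illustration; only the extractImages export and its { url, altText } result shape come from the patches above.

// usage-sketch.mjs (hypothetical file): exercise extractImages end to end
import { extractImages } from './mediaParser.js';

try {
  // Any reachable HTML page works; example.com is just a placeholder.
  const images = await extractImages('https://example.com');
  for (const { url, altText } of images) {
    // URLs arrive absolute and de-duplicated, with data: URIs filtered out.
    console.log(url, altText || '(no alt text)');
  }
} catch (err) {
  // PATCH 4 re-throws with the original failure attached as `cause`.
  console.error(err.message, err.cause ? err.cause.message : '');
}

Note that this sketch only covers the standalone parser. The recorder flow in PATCH 5-7 uses a separate path: the browser-side snippet posts { url, tag, selector, extractedText } to the parent window, the DOMBrowserRenderer relays it over the dom:media-extracted socket event, and Generator.handleMediaExtracted records it as a workflow media event.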