Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 124 additions & 1 deletion maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -1510,4 +1510,127 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return results;
};

})(window);
})(window);

/* Media extraction support - listens for clicks on media elements and extracts text.
Sends { url, tag, selector, extractedText } via postMessage to the parent window. */

// Extract text from image: alt/title first, then OCR via Tesseract if available
async function extractImageText(img) {
try {
const altTitle = (img.alt || img.title || '').trim();
if (altTitle) return altTitle;

if (window.Tesseract && typeof window.Tesseract.recognize === 'function') {
// Use the image src (may be data: or remote); ignore if data: that contains large chunks
const src = img.currentSrc || img.src || '';
if (!src) return '';
try {
const result = await window.Tesseract.recognize(src, 'eng');
return (result?.data?.text || '').trim();
} catch (e) {
return '';
}
}
} catch (e) {
return '';
}
return '';
}

// Extract text from PDF using pdf.js if available
async function extractPdfText(url) {
try {
if (!window.pdfjsLib) return '';
const loadingTask = window.pdfjsLib.getDocument(url);
const pdf = await loadingTask.promise;
let text = '';
for (let i = 1; i <= pdf.numPages; i++) {
// eslint-disable-next-line no-await-in-loop
const page = await pdf.getPage(i);
// eslint-disable-next-line no-await-in-loop
const content = await page.getTextContent();
text += content.items.map((it) => it.str).join(' ') + '\n';
}
return text.trim();
} catch (e) {
return '';
}
}

// Helper to generate structural selector if function is available
function structuralSelector(el) {
try {
if (typeof GetSelectorStructural === 'function') return GetSelectorStructural(el);
} catch (e) {
// fallthrough
}
return '';
}

// Click listener for media elements.
// On a click whose target is an <img>, <iframe>, <embed> or <object>, extracts
// text from the media (alt/title/OCR for images, pdf.js for PDF documents) and,
// when both a URL and some text were found, posts
// { type: 'maxun:media-extracted', url, tag, selector, extractedText }
// to the parent window. All failures are swallowed so that recording a click
// can never break the host page.
document.addEventListener('click', async (ev) => {
  try {
    const el = ev.target;
    if (!el || !el.tagName) return;

    const tag = el.tagName.toLowerCase();
    const selector = structuralSelector(el);
    let url = '';
    let extractedText = '';

    if (tag === 'img') {
      url = el.currentSrc || el.src || '';
      // extractImageText already prefers alt/title and only then falls back to OCR.
      extractedText = await extractImageText(el);
    } else if (tag === 'iframe' || tag === 'embed' || tag === 'object') {
      // <object data="...pdf"> carries its URL in `data`; iframe/embed use `src`.
      url = tag === 'object' ? (el.data || '') : (el.src || el.data || '');
      // Accept a query string or a fragment after the extension
      // (e.g. "report.pdf?dl=1", "report.pdf#page=2").
      if (url && /\.pdf(?:[?#]|$)/i.test(url)) {
        extractedText = await extractPdfText(url);
      }
    }

    if (url && extractedText) {
      // Post to parent so the recorder frontend (or wrapper) can relay it to server socket.
      // NOTE(review): targetOrigin '*' delivers the extracted text to whatever page
      // embeds this document — confirm the recorder origin and restrict it if known.
      try {
        window.parent.postMessage({
          type: 'maxun:media-extracted',
          url,
          tag,
          selector,
          extractedText
        }, '*');
      } catch (e) {
        // ignore: the parent may be unavailable; extraction is best-effort
      }
    }
  } catch (e) {
    // swallow: never let media extraction interfere with the page's own click handling
  }
});

// Lazily pull in the optional media libraries from the CDN.
// Each script is injected only when its global is not already present; the
// extractors above simply return '' until (or unless) the libraries load.
if (!window.Tesseract) {
  const ocrScript = Object.assign(document.createElement('script'), {
    src: 'https://cdn.jsdelivr.net/npm/[email protected]/dist/tesseract.min.js',
    async: true,
  });
  document.head.appendChild(ocrScript);
}
if (!window.pdfjsLib) {
  // Once the bundle has loaded, publish it as window.pdfjsLib and point it at its worker.
  const exposePdfJs = () => {
    try {
      window.pdfjsLib = window['pdfjs-dist/build/pdf'];
      if (!window.pdfjsLib) return;
      window.pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.worker.min.js';
    } catch (e) {}
  };
  const pdfScript = Object.assign(document.createElement('script'), {
    src: 'https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.min.js',
    async: true,
    onload: exposePdfJs,
  });
  document.head.appendChild(pdfScript);
}
123 changes: 123 additions & 0 deletions mediaParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// mediaParser.js

import axios from 'axios';
import * as cheerio from 'cheerio';
import { URL } from 'url';
import logger from './logger'; // Adjust path if necessary

/**
* Fetches and extracts all images from a webpage, including responsive images.
* This includes regular <img> tags, srcset URLs, and <source> tags within <picture> elements.
*
* @param {string} url - The webpage URL to extract images from.
* Must be a valid, non-empty string.
* @returns {Array} - An array of objects:
* {
* url: string, // The absolute URL of the image
* altText: string // The alt text of the image (if any)
* }
* @throws {TypeError} - If the URL is missing or not a string.
* @throws {Error} - If the fetch fails or the response is not HTML.
*/
async function extractImages(url) {
if (!url || typeof url !== 'string') {
throw new TypeError('URL must be a non-empty string');
}

try {
// Fetch webpage with axios
const response = await axios.get(url, {
timeout: 10000,
maxContentLength: 10 * 1024 * 1024,
maxBodyLength: 10 * 1024 * 1024,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; MaxunBot/1.0; +https://maxun.dev)'
},
maxRedirects: 5
});

// Validate that content is HTML
const contentType = response.headers['content-type'] || '';
if (!contentType.includes('text/html')) {
throw new Error(`Expected HTML but got ${contentType}`);
}

const html = response.data;
const $ = cheerio.load(html, {
decodeEntities: true,
normalizeWhitespace: false
});

const images = [];
const seen = new Set();

// Extract <img> tags
$('img').each((index, element) => {
const alt = $(element).attr('alt') || '';
let src = $(element).attr('src');

if (src) {
try {
const absoluteUrl = new URL(src, url).href;
if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
seen.add(absoluteUrl);
images.push({ url: absoluteUrl, altText: alt });
}
} catch {
logger.warn(`Invalid image URL: ${src}`);
}
}

// Handle srcset (responsive images)
const srcset = $(element).attr('srcset');
if (srcset) {
const srcsetUrls = srcset.split(',')
.map(s => s.trim().split(/\s+/)[0])
.filter(Boolean);

for (const srcsetUrl of srcsetUrls) {
try {
const absoluteUrl = new URL(srcsetUrl, url).href;
if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
seen.add(absoluteUrl);
images.push({ url: absoluteUrl, altText: alt });
}
} catch {
logger.warn(`Invalid srcset URL: ${srcsetUrl}`);
}
}
}
});

// Extract <source> tags inside <picture> elements
$('picture source').each((i, element) => {
const srcset = $(element).attr('srcset');
if (srcset) {
const srcsetUrls = srcset.split(',')
.map(s => s.trim().split(/\s+/)[0])
.filter(Boolean);

for (const srcsetUrl of srcsetUrls) {
try {
const absoluteUrl = new URL(srcsetUrl, url).href;
if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
seen.add(absoluteUrl);
images.push({ url: absoluteUrl, altText: '' });
}
} catch {
logger.warn(`Invalid srcset URL in <source>: ${srcsetUrl}`);
}
}
}
});

return images;

} catch (error) {
// Preserve original stack trace
throw new Error(`Failed to extract images from ${url}`, { cause: error });
}
}

// Export function for other modules
export { extractImages };
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
"@types/react-dom": "^18.0.1",
"@types/uuid": "^8.3.4",
"airtable": "^0.12.2",
"axios": "^1.9.0",
"axios": "^1.12.2",
"bcrypt": "^5.1.1",
"body-parser": "^1.20.3",
"buffer": "^6.0.3",
"cheerio": "^1.1.2",
"connect-pg-simple": "^10.0.0",
"cookie-parser": "^1.4.6",
"cors": "^2.8.5",
Expand Down Expand Up @@ -55,6 +56,7 @@
"minio": "^8.0.1",
"moment-timezone": "^0.5.45",
"node-cron": "^3.0.3",
"pdfjs-dist": "^5.4.296",
"pg": "^8.13.0",
"pg-boss": "^10.1.6",
"pkce-challenge": "^4.1.0",
Expand All @@ -80,6 +82,7 @@
"styled-components": "^5.3.3",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.1",
"tesseract.js": "^6.0.1",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Unused dependency: tesseract.js is not referenced in the current implementation.

The tesseract.js library was added but is not imported or used in mediaParser.js. Consider removing it unless it's planned for future use (e.g., OCR on images).

If tesseract.js is intended for future functionality, consider adding a comment in the code or creating a follow-up issue to track its implementation.

🤖 Prompt for AI Agents
In package.json around line 84, the dependency "tesseract.js": "^6.0.1" is
unused in the codebase (not imported in mediaParser.js); remove the tesseract.js
entry from package.json and run npm/yarn install to update lockfile, or if it is
intended for future OCR work, leave the dependency but add a short TODO comment
in the relevant module or create a follow-up issue referencing this dependency
and its planned usage so it is tracked.

"typedoc": "^0.23.8",
"typescript": "^4.6.3",
"uuid": "^8.3.2",
Expand Down
29 changes: 29 additions & 0 deletions server/src/browser-management/inputHandlers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,34 @@ const handleGoForward = async (activeBrowser: RemoteBrowser, page: Page) => {
}
};

/**
 * Handle media extracted event forwarded from client (via postMessage relay).
 * data: { url, tag, selector, extractedText }
 *
 * The payload arrives over the socket and is therefore untrusted: anything that
 * is not an object carrying string `url` and `extractedText` fields is logged
 * and dropped instead of being forwarded to the generator.
 *
 * @param data - Media payload relayed by the client.
 * @param userId - Passed through to handleWrapper together with the payload.
 */
const onMediaExtracted = async (data: { url: string; tag: string; selector: string; extractedText: string }, userId: string) => {
  // Re-check the shape at runtime; the static type says nothing about what a client really sent.
  const payload: unknown = data;
  const isValid =
    typeof payload === 'object' &&
    payload !== null &&
    typeof (payload as Record<string, unknown>).url === 'string' &&
    typeof (payload as Record<string, unknown>).extractedText === 'string';
  if (!isValid) {
    logger.log('warn', 'Ignoring malformed media-extracted event from client');
    return;
  }

  logger.log('debug', 'Handling media-extracted event emitted from client');
  await handleWrapper(handleMediaExtracted, userId, data);
};

/**
 * Forwards a media-extracted payload to the active browser's workflow generator.
 *
 * Does nothing when the page is already closed. When the generator has no
 * `handleMediaExtracted` method a warning is logged and nothing is recorded.
 * Errors are logged at 'warn' level and never propagate to the caller.
 *
 * @param activeBrowser - Browser whose `generator` receives the payload.
 * @param page - Page the event belongs to; checked with `isClosed()` and passed on.
 * @param data - Media payload: { url, tag, selector, extractedText }.
 */
const handleMediaExtracted = async (activeBrowser: RemoteBrowser, page: Page, data: { url: string; tag: string; selector: string; extractedText: string }) => {
  try {
    if (page.isClosed()) {
      logger.log("debug", `Ignoring media-extracted event: page is closed`);
      return;
    }

    const generator = activeBrowser.generator;
    if (!generator || typeof generator.handleMediaExtracted !== 'function') {
      logger.log('warn', 'Generator does not implement handleMediaExtracted');
      return;
    }

    await generator.handleMediaExtracted(data, page);
    // Only report success once the generator has actually taken the event.
    logger.log('debug', `Media extracted added: ${data.url}`);
  } catch (e) {
    // Anything can be thrown; do not assume an Error instance.
    const message = e instanceof Error ? e.message : String(e);
    logger.log('warn', `Error handling media-extracted event: ${message}`);
  }
};

/**
* Handles the click action event.
* @param activeBrowser - the active remote browser {@link RemoteBrowser}
Expand Down Expand Up @@ -851,6 +879,7 @@ const registerInputHandlers = (socket: Socket, userId: string) => {
socket.on("dom:click", (data) => onDOMClickAction(data, userId));
socket.on("dom:keypress", (data) => onDOMKeyboardAction(data, userId));
socket.on("dom:addpair", (data) => onDOMWorkflowPair(data, userId));
socket.on("dom:media-extracted", (data) => onMediaExtracted(data, userId));
};

export default registerInputHandlers;
30 changes: 30 additions & 0 deletions server/src/workflow-management/classes/Generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,36 @@ export class WorkflowGenerator {
});
}

/**
 * Records a media item reported by the browser-side snippet.
 *
 * Creates `workflowRecord` (with an empty workflow) when it is missing, makes
 * sure it carries an `events` array, appends a `{ type: 'media', ... }` entry
 * stamped with the current time, and then emits 'workflow:media-added' on the
 * socket. Failures are logged at 'warn' level; emit failures are ignored.
 *
 * @param data - Media payload: { url, tag, selector, extractedText }.
 * @param page - Page the event came from; not used by this method.
 */
public async handleMediaExtracted(data: { url: string; tag: string; selector: string; extractedText: string }, page: Page) {
  try {
    if (!this.workflowRecord) {
      this.workflowRecord = { workflow: [] } as WorkflowFile;
    }

    // `events` is a non-standard addition to the workflow record, hence the untyped view.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const record = this.workflowRecord as any;
    record.events = record.events || [];

    const mediaEvent = {
      type: 'media',
      url: data.url,
      tag: data.tag,
      selector: data.selector,
      extractedText: data.extractedText,
      timestamp: Date.now(),
    };
    record.events.push(mediaEvent);

    // Let the client know a new event was recorded; this is best-effort.
    try {
      this.socket.emit('workflow:media-added', { url: data.url, selector: data.selector });
    } catch (e) {
      // ignore
    }
  } catch (e) {
    logger.log('warn', `handleMediaExtracted failed: ${(e as Error).message}`);
  }
}

/**
* Registers the event handlers for all generator-related events on the socket.
* @param socket The socket used to communicate with the client.
Expand Down
Loading