-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Feat: Add image extraction for webpages (media parsing) #818
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Aman-Raj-bat
wants to merge
7
commits into
getmaxun:develop
Choose a base branch
from
Aman-Raj-bat:feat/image-parsing
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
3a102be
Feat: Add image extraction for webpages (media parsing)
Aman-Raj-bat df2efee
Refactor: Improve image extraction with ES6, validation, srcset, and …
Aman-Raj-bat caaf131
Refactor: Improve image extraction
Aman-Raj-bat d24496c
Refactor: Enhance image extraction
Aman-Raj-bat 6624bba
feat(issue #164): integrate media extraction into recorder flow
Aman-Raj-bat 3a68e60
feat(issue #164): integrate media extraction into recorder flow
Aman-Raj-bat 9768844
more fixes for the image parser
Aman-Raj-bat File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1510,4 +1510,127 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, | |
return results; | ||
}; | ||
|
||
})(window); | ||
})(window); | ||
|
||
/* Media extraction support - listens for clicks on media elements (<img>, <iframe>, <embed>, <object>) and extracts their text.
   Sends { type: 'maxun:media-extracted', url, tag, selector, extractedText } via postMessage to the parent window. */
|
||
/**
 * Best-effort text extraction for an image element.
 *
 * Resolution order:
 *   1. the trimmed `alt` attribute, if non-empty;
 *   2. the trimmed `title` attribute, if non-empty;
 *   3. OCR of the image source through `window.Tesseract.recognize`, when that
 *      function is present on the page.
 *
 * `alt` and `title` are trimmed independently so that a whitespace-only `alt`
 * does not hide a meaningful `title`.
 *
 * @param {{alt?: string, title?: string, currentSrc?: string, src?: string}} img
 *   Image element (or any object exposing the same properties).
 * @returns {Promise<string>} The extracted text, or '' when nothing could be
 *   extracted. Never rejects: every failure resolves to ''.
 */
async function extractImageText(img) {
  try {
    const alt = typeof img.alt === 'string' ? img.alt.trim() : '';
    if (alt) return alt;

    const title = typeof img.title === 'string' ? img.title.trim() : '';
    if (title) return title;

    // OCR is optional: the library may not have been loaded (yet).
    if (!window.Tesseract || typeof window.Tesseract.recognize !== 'function') {
      return '';
    }

    // The source is handed to the OCR library as-is (it may be a remote or a data: URL).
    const src = img.currentSrc || img.src || '';
    if (!src) return '';

    const result = await window.Tesseract.recognize(src, 'eng');
    return (result?.data?.text || '').trim();
  } catch (e) {
    // Deliberately best-effort: a failed extraction must not disturb the page.
    return '';
  }
}
|
||
/**
 * Extracts the text of a PDF with pdf.js, when `window.pdfjsLib` is available.
 *
 * Pages are read sequentially; the items of each page are joined with a space
 * and pages are separated by a newline. The loading task is always destroyed
 * afterwards so the document and its worker resources are released even when
 * loading or parsing fails.
 *
 * @param {string} url - Location of the PDF, passed straight to `pdfjsLib.getDocument`.
 * @returns {Promise<string>} The trimmed document text, or '' when pdf.js is
 *   missing or anything goes wrong. Never rejects.
 */
async function extractPdfText(url) {
  let loadingTask = null;
  try {
    if (!window.pdfjsLib) return '';
    loadingTask = window.pdfjsLib.getDocument(url);
    const pdf = await loadingTask.promise;

    const pageTexts = [];
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      // eslint-disable-next-line no-await-in-loop
      const page = await pdf.getPage(pageNumber);
      // eslint-disable-next-line no-await-in-loop
      const content = await page.getTextContent();
      pageTexts.push(content.items.map((item) => item.str).join(' '));
    }
    return pageTexts.join('\n').trim();
  } catch (e) {
    // Deliberately best-effort: an unreadable PDF yields no text.
    return '';
  } finally {
    // Release the document/worker; cleanup failures must not mask the result.
    if (loadingTask && typeof loadingTask.destroy === 'function') {
      try {
        await loadingTask.destroy();
      } catch (e) {
        // ignore cleanup errors
      }
    }
  }
}
|
||
/**
 * Produces a structural selector for an element by delegating to the global
 * `GetSelectorStructural` function, when one is defined.
 *
 * @param {*} el - Element to describe; forwarded unchanged to `GetSelectorStructural`.
 * @returns {*} Whatever `GetSelectorStructural` returns, or '' when the function
 *   is not defined, throws, or returns null/undefined.
 */
function structuralSelector(el) {
  try {
    if (typeof GetSelectorStructural === 'function') {
      // Normalise a missing result so callers always get the '' fallback.
      return GetSelectorStructural(el) ?? '';
    }
  } catch (e) {
    // fall through to the empty selector
  }
  return '';
}
|
||
// Click listener for media elements.
// For a clicked <img> it extracts alt/title/OCR text; for a clicked <iframe>,
// <embed> or <object> whose URL points at a PDF it extracts the PDF text.
// When both a URL and some text were found, the result is posted to the parent
// window as a 'maxun:media-extracted' message.
document.addEventListener('click', async (ev) => {
  try {
    const el = ev.target;
    if (!el || !el.tagName) return;

    const tag = el.tagName.toLowerCase();
    const isImage = tag === 'img';
    const isEmbeddedDocument = tag === 'iframe' || tag === 'embed' || tag === 'object';
    if (!isImage && !isEmbeddedDocument) return;

    // Capture the selector before any await, while the DOM is as it was at click time.
    const selector = structuralSelector(el);
    let url = '';
    let extractedText = '';

    if (isImage) {
      url = el.currentSrc || el.src || '';
      // extractImageText already prefers alt/title before falling back to OCR.
      extractedText = await extractImageText(el);
    } else {
      // <iframe>/<embed> expose `src`, <object> exposes `data`.
      url = el.src || el.data || '';
      // Accept a query string or a fragment after the extension (e.g. file.pdf#page=2).
      if (/\.pdf(?:[?#]|$)/i.test(url)) {
        extractedText = await extractPdfText(url);
      }
    }

    if (!url || !extractedText) return;

    // Post to parent so the recorder frontend (or wrapper) can relay it to server socket.
    // NOTE(review): the '*' target origin delivers the extracted text to whatever
    // window is the parent; restrict it if the recorder origin is known.
    window.parent.postMessage({
      type: 'maxun:media-extracted',
      url,
      tag,
      selector,
      extractedText
    }, '*');
  } catch (e) {
    // Deliberately swallowed: extraction is best-effort and must never break
    // click handling on the recorded page.
  }
});
|
||
// Load Tesseract and PDF.js if not already present (CDN).
// NOTE(review): the version pins in the three CDN URLs below read "[email protected]",
// which looks like e-mail obfuscation applied to "<package>@<version>" — restore
// the real pins before shipping; the worker URL must use the same pdfjs-dist
// version as the library URL.
if (!window.Tesseract) {
  const tesseractScript = document.createElement('script');
  tesseractScript.src = 'https://cdn.jsdelivr.net/npm/[email protected]/dist/tesseract.min.js';
  tesseractScript.async = true;
  // document.head can still be null when this runs before the document is parsed.
  const tesseractParent = document.head || document.documentElement;
  if (tesseractParent) tesseractParent.appendChild(tesseractScript);
}
if (!window.pdfjsLib) {
  const pdfScript = document.createElement('script');
  pdfScript.src = 'https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.min.js';
  pdfScript.async = true;
  pdfScript.onload = () => {
    try {
      // Keep a pdfjsLib global the bundle may already have installed; only fall
      // back to the module-name global, and never overwrite it with undefined.
      const lib = window.pdfjsLib || window['pdfjs-dist/build/pdf'];
      if (!lib) return;
      window.pdfjsLib = lib;
      lib.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/[email protected]/build/pdf.worker.min.js';
    } catch (e) {
      // Best-effort: without a configured worker, PDF extraction simply yields no text.
    }
  };
  const pdfParent = document.head || document.documentElement;
  if (pdfParent) pdfParent.appendChild(pdfScript);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
// mediaParser.js | ||
|
||
import axios from 'axios'; | ||
import * as cheerio from 'cheerio'; | ||
import { URL } from 'url'; | ||
import logger from './logger'; // Adjust path if necessary | ||
|
||
/** Content types (matched case-insensitively) that are parsed as HTML. */
const HTML_CONTENT_TYPES = ['text/html', 'application/xhtml+xml'];

/**
 * Tells whether a character is srcset whitespace (tab, LF, FF, CR or space).
 *
 * @param {string} ch - A single character.
 * @returns {boolean}
 */
function isSrcsetSpace(ch) {
  return ch === ' ' || ch === '\t' || ch === '\n' || ch === '\f' || ch === '\r';
}

/**
 * Extracts the URLs from a `srcset` attribute value.
 *
 * A naive `split(',')` corrupts URLs that themselves contain commas (CDN
 * transformation segments, data: URIs). Following the HTML parsing rules, a
 * URL is instead a run of non-whitespace characters; a comma only separates
 * candidates when it ends that run or follows the descriptors.
 *
 * @param {string} srcset - Raw attribute value.
 * @returns {string[]} Candidate URLs in document order, descriptors removed.
 */
function parseSrcsetUrls(srcset) {
  const input = String(srcset);
  const urls = [];
  let pos = 0;

  while (pos < input.length) {
    // Skip separators between candidates.
    while (pos < input.length && (isSrcsetSpace(input[pos]) || input[pos] === ',')) pos++;
    if (pos >= input.length) break;

    // The URL is everything up to the next whitespace.
    const start = pos;
    while (pos < input.length && !isSrcsetSpace(input[pos])) pos++;
    let candidate = input.slice(start, pos);

    if (candidate.endsWith(',')) {
      // Trailing comma(s) end a candidate that has no descriptors.
      candidate = candidate.replace(/,+$/, '');
    } else {
      // Skip the descriptors up to the next top-level comma.
      let depth = 0;
      while (pos < input.length) {
        const ch = input[pos];
        pos++;
        if (ch === '(') depth++;
        else if (ch === ')' && depth > 0) depth--;
        else if (ch === ',' && depth === 0) break;
      }
    }

    if (candidate) urls.push(candidate);
  }

  return urls;
}

/**
 * Determines the base URL against which the document's relative URLs resolve:
 * the document's `<base href>` when present, otherwise the final response URL
 * (after redirects) when the HTTP client exposes it, otherwise the requested URL.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {object} response - axios response for the page.
 * @param {string} requestedUrl - URL originally passed to `extractImages`.
 * @returns {string} Absolute base URL.
 */
function resolveDocumentBase($, response, requestedUrl) {
  // In Node, the redirect-following layer records the final URL on the raw response.
  const documentUrl = response?.request?.res?.responseUrl || requestedUrl;
  const baseHref = $('base[href]').attr('href');
  if (!baseHref) return documentUrl;
  try {
    return new URL(baseHref, documentUrl).href;
  } catch {
    logger.warn(`Invalid <base href>: ${baseHref}`);
    return documentUrl;
  }
}

/**
 * Fetches a webpage and extracts all images from it, including responsive images.
 * This covers regular <img> tags (`src` and `srcset`) and <source> tags within
 * <picture> elements. URLs are made absolute, de-duplicated, and data: URIs are
 * skipped. Invalid URLs are logged as warnings and ignored.
 *
 * @param {string} url - The webpage URL to extract images from.
 *                       Must be a valid, non-empty string.
 * @returns {Promise<Array<{url: string, altText: string}>>} Resolves to an array of objects:
 *   {
 *     url: string,      // The absolute URL of the image
 *     altText: string   // The alt text of the image ('' if none, and for <source> entries)
 *   }
 * @throws {TypeError} - If the URL is missing or not a string.
 * @throws {Error} - If the fetch fails or the response is not HTML; the
 *                   underlying error is available as `error.cause`.
 */
async function extractImages(url) {
  if (!url || typeof url !== 'string') {
    throw new TypeError('URL must be a non-empty string');
  }

  try {
    // Fetch webpage with axios
    const response = await axios.get(url, {
      timeout: 10000,
      maxContentLength: 10 * 1024 * 1024,
      maxBodyLength: 10 * 1024 * 1024,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; MaxunBot/1.0; +https://maxun.dev)'
      },
      maxRedirects: 5
    });

    // Validate that content is HTML (media types are case-insensitive)
    const contentType = String(response.headers['content-type'] || '');
    const normalizedType = contentType.toLowerCase();
    if (!HTML_CONTENT_TYPES.some((type) => normalizedType.includes(type))) {
      throw new Error(`Expected HTML but got ${contentType}`);
    }

    const $ = cheerio.load(response.data, {
      decodeEntities: true,
      normalizeWhitespace: false
    });
    const baseUrl = resolveDocumentBase($, response, url);

    const images = [];
    const seen = new Set();

    // Resolves, filters and records one candidate URL.
    const addImage = (rawUrl, altText, warnLabel) => {
      let absoluteUrl;
      try {
        absoluteUrl = new URL(rawUrl, baseUrl).href;
      } catch {
        logger.warn(`${warnLabel}: ${rawUrl}`);
        return;
      }
      if (seen.has(absoluteUrl) || absoluteUrl.startsWith('data:')) return;
      seen.add(absoluteUrl);
      images.push({ url: absoluteUrl, altText });
    };

    // Extract <img> tags, then their srcset (responsive images)
    $('img').each((index, element) => {
      const alt = $(element).attr('alt') || '';

      const src = $(element).attr('src');
      if (src) addImage(src, alt, 'Invalid image URL');

      const srcset = $(element).attr('srcset');
      if (srcset) {
        for (const srcsetUrl of parseSrcsetUrls(srcset)) {
          addImage(srcsetUrl, alt, 'Invalid srcset URL');
        }
      }
    });

    // Extract <source> tags inside <picture> elements
    $('picture source').each((index, element) => {
      const srcset = $(element).attr('srcset');
      if (!srcset) return;
      for (const srcsetUrl of parseSrcsetUrls(srcset)) {
        addImage(srcsetUrl, '', 'Invalid srcset URL in <source>');
      }
    });

    return images;
  } catch (error) {
    // Preserve original stack trace
    throw new Error(`Failed to extract images from ${url}`, { cause: error });
  }
}
|
||
// Export function for other modules | ||
export { extractImages }; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unused dependency: tesseract.js is not referenced in the current implementation.
The tesseract.js library was added but is not imported or used in mediaParser.js. Consider removing it unless it's planned for future use (e.g., OCR on images).
If tesseract.js is intended for future functionality, consider adding a comment in the code or creating a follow-up issue to track its implementation.
🤖 Prompt for AI Agents