Skip to content

Commit 5058a3b

Browse files
authored
Merge pull request #320 from getmaxun/shadow-iframe
feat: compatibility check for shadowDOM and iframe scraping
2 parents c132f27 + 9c8a980 commit 5058a3b

File tree

2 files changed

+70
-56
lines changed

2 files changed

+70
-56
lines changed

maxun-core/src/browserSide/scraper.js

Lines changed: 68 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -207,69 +207,82 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
207207
function findAllElements(config) {
208208
// Regular DOM query if no special delimiters
209209
if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
210-
return Array.from(document.querySelectorAll(config.selector));
210+
return Array.from(document.querySelectorAll(config.selector));
211211
}
212212

213-
// Split by both types of delimiters
214-
const parts = config.selector.split(/(?:>>|:>>)/).map(s => s.trim());
215-
const delimiters = config.selector.match(/(?:>>|:>>)/g) || [];
216-
let currentElements = [document];
217-
218-
for (let i = 0; i < parts.length; i++) {
219-
const part = parts[i];
220-
const nextElements = [];
221-
const isLast = i === parts.length - 1;
222-
const delimiter = delimiters[i] || '';
223-
const isIframeTraversal = delimiter === ':>>';
224-
225-
for (const element of currentElements) {
226-
try {
227-
let targets;
228-
229-
if (i === 0) {
230-
// First selector is queried from main document
231-
targets = Array.from(element.querySelectorAll(part))
232-
.filter(el => {
233-
if (isLast) return true;
234-
// For iframe traversal, only include iframes
235-
if (isIframeTraversal) return el.tagName === 'IFRAME';
236-
// For shadow DOM traversal, only include elements with shadow root
237-
return el.shadowRoot && el.shadowRoot.mode === 'open';
238-
});
239-
} else {
240-
if (isIframeTraversal) {
241-
// Handle iframe traversal
242-
const iframeDocument = element.contentDocument || element.contentWindow?.document;
243-
if (!iframeDocument) continue;
244-
245-
targets = Array.from(iframeDocument.querySelectorAll(part));
246-
if (!isLast) {
247-
targets = targets.filter(el => el.tagName === 'IFRAME');
213+
// First handle iframe traversal if present
214+
if (config.selector.includes(':>>')) {
215+
const parts = config.selector.split(':>>').map(s => s.trim());
216+
let currentElements = [document];
217+
218+
// Traverse through each part of the selector
219+
for (let i = 0; i < parts.length; i++) {
220+
const part = parts[i];
221+
const nextElements = [];
222+
const isLast = i === parts.length - 1;
223+
224+
for (const element of currentElements) {
225+
try {
226+
// For document or iframe document
227+
const doc = element.contentDocument || element || element.contentWindow?.document;
228+
if (!doc) continue;
229+
230+
// Query elements in current context
231+
const found = Array.from(doc.querySelectorAll(part));
232+
233+
if (isLast) {
234+
// If it's the last part, keep all matching elements
235+
nextElements.push(...found);
236+
} else {
237+
// If not last, only keep iframes for next iteration
238+
const iframes = found.filter(el => el.tagName === 'IFRAME');
239+
nextElements.push(...iframes);
240+
}
241+
} catch (error) {
242+
console.warn('Cannot access iframe content:', error, {
243+
part,
244+
element,
245+
index: i
246+
});
248247
}
249-
} else {
250-
// Handle shadow DOM traversal
251-
const shadowRoot = element.shadowRoot;
252-
if (!shadowRoot || shadowRoot.mode !== 'open') continue;
253-
254-
targets = Array.from(shadowRoot.querySelectorAll(part));
255-
if (!isLast) {
256-
targets = targets.filter(el => el.shadowRoot && el.shadowRoot.mode === 'open');
257-
}
258-
}
259248
}
260-
261-
nextElements.push(...targets);
262-
} catch (error) {
263-
console.warn('Cannot access content:', error);
264-
continue;
265-
}
249+
250+
if (nextElements.length === 0) {
251+
console.warn('No elements found for part:', part, 'at depth:', i);
252+
return [];
253+
}
254+
currentElements = nextElements;
266255
}
256+
257+
return currentElements;
258+
}
259+
260+
// Handle shadow DOM traversal
261+
if (config.selector.includes('>>')) {
262+
const parts = config.selector.split('>>').map(s => s.trim());
263+
let currentElements = [document];
267264

268-
if (nextElements.length === 0) return [];
269-
currentElements = nextElements;
265+
for (const part of parts) {
266+
const nextElements = [];
267+
for (const element of currentElements) {
268+
// Try regular DOM first
269+
const found = Array.from(element.querySelectorAll(part));
270+
271+
// Then check shadow roots
272+
for (const foundEl of found) {
273+
if (foundEl.shadowRoot) {
274+
nextElements.push(foundEl.shadowRoot);
275+
} else {
276+
nextElements.push(foundEl);
277+
}
278+
}
279+
}
280+
currentElements = nextElements;
281+
}
282+
return currentElements.filter(el => !(el instanceof ShadowRoot));
270283
}
271284

272-
return currentElements;
285+
return [];
273286
}
274287

275288
// Modified to handle iframe context for URL resolution

server/src/workflow-management/utils.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,8 @@ export const getBestSelectorForAction = (action: Action) => {
1616

1717
if (selectors?.iframeSelector?.full) {
1818
return selectors.iframeSelector.full;
19-
19+
}
20+
2021
if (selectors?.shadowSelector?.full) {
2122
return selectors.shadowSelector.full;
2223
}

0 commit comments

Comments
 (0)