diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 7ee0f812d..9321693bf 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -423,50 +423,155 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - // Enhanced query function to handle iframe, frame and shadow DOM + // XPath evaluation functions + const evaluateXPath = (rootElement, xpath) => { + try { + const ownerDoc = + rootElement.nodeType === Node.DOCUMENT_NODE + ? rootElement + : rootElement.ownerDocument; + + if (!ownerDoc) return null; + + const result = ownerDoc.evaluate( + xpath, + rootElement, + null, + XPathResult.FIRST_ORDERED_NODE_TYPE, + null + ); + + return result.singleNodeValue; + } catch (error) { + console.warn("XPath evaluation failed:", xpath, error); + return null; + } + }; + + const evaluateXPathAll = (rootElement, xpath) => { + try { + const ownerDoc = + rootElement.nodeType === Node.DOCUMENT_NODE + ? rootElement + : rootElement.ownerDocument; + + if (!ownerDoc) return []; + + const result = ownerDoc.evaluate( + xpath, + rootElement, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); + + const elements = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node); + } + } + + return elements; + } catch (error) { + console.warn("XPath evaluation failed:", xpath, error); + return []; + } + }; + + // Helper function to detect selector type + const isXPathSelector = (selector) => { + return ( + selector.startsWith("//") || + selector.startsWith("/") || + selector.startsWith("./") + ); + }; + + // Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath const queryElement = (rootElement, selector) => { - if (!selector.includes('>>') && !selector.includes(':>>')) { + if (!selector.includes(">>") && !selector.includes(":>>")) { + // Check if it's an XPath selector + if (isXPathSelector(selector)) { + return evaluateXPath(rootElement, selector); + } else { return rootElement.querySelector(selector); + } } - const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim()); + const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); let currentElement = rootElement; for (let i = 0; i < parts.length; i++) { - if (!currentElement) return null; + if (!currentElement) return null; - // Handle iframe and frame traversal - if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { - try { - const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document; - currentElement = frameDoc.querySelector(parts[i]); - continue; - } catch (e) { - console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e); - return null; - } + // Handle iframe and frame traversal + if ( + currentElement.tagName === "IFRAME" || + currentElement.tagName === "FRAME" + ) { + try { + const frameDoc = + currentElement.contentDocument || + currentElement.contentWindow.document; + if (!frameDoc) return null; + + if (isXPathSelector(parts[i])) { + currentElement = evaluateXPath(frameDoc, parts[i]); + } else { + currentElement = frameDoc.querySelector(parts[i]); + } + continue; + } catch (e) { + console.warn( + `Cannot access ${currentElement.tagName.toLowerCase()} content:`, + e + ); + return null; } + } - // Try regular DOM first - let nextElement = currentElement.querySelector(parts[i]); + let nextElement = null; - // Try shadow DOM if not found - if (!nextElement && currentElement.shadowRoot) { - nextElement = currentElement.shadowRoot.querySelector(parts[i]); + // Try regular DOM first + if ("querySelector" in currentElement) { + if (isXPathSelector(parts[i])) { + nextElement = evaluateXPath(currentElement, parts[i]); + } else { + nextElement = currentElement.querySelector(parts[i]); } + } - // Check children's shadow roots if still not found - if (!nextElement) { - const children = Array.from(currentElement.children || []); - for (const child of children) { - if (child.shadowRoot) { - nextElement = child.shadowRoot.querySelector(parts[i]); - if (nextElement) break; - } + // Try shadow DOM if not found + if ( + !nextElement && + "shadowRoot" in currentElement && + currentElement.shadowRoot + ) { + if (isXPathSelector(parts[i])) { + nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]); + } else { + nextElement = currentElement.shadowRoot.querySelector(parts[i]); + } + } + + // Check children's shadow roots if still not found + if (!nextElement && "children" in currentElement) { + const children = Array.from(currentElement.children || []); + for (const child of children) { + if (child.shadowRoot) { + if (isXPathSelector(parts[i])) { + nextElement = evaluateXPath(child.shadowRoot, parts[i]); + } else { + nextElement = child.shadowRoot.querySelector(parts[i]); } + if (nextElement) break; + } } + } - currentElement = nextElement; + currentElement = nextElement; } return currentElement; @@ -474,322 +579,492 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, // Enhanced query all function for both contexts const queryElementAll = (rootElement, selector) => { - if (!selector.includes('>>') && !selector.includes(':>>')) { - return rootElement.querySelectorAll(selector); + if (!selector.includes(">>") && !selector.includes(":>>")) { + if (isXPathSelector(selector)) { + return evaluateXPathAll(rootElement, selector); + } else { + return Array.from(rootElement.querySelectorAll(selector)); + } } - const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim()); + const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); let currentElements = [rootElement]; for (const part of parts) { - const nextElements = []; - - for (const element of currentElements) { - // Handle iframe and frame traversal - if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') { - try { - const frameDoc = element.contentDocument || element.contentWindow.document; - nextElements.push(...frameDoc.querySelectorAll(part)); - } catch (e) { - console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e); - continue; - } + const nextElements = []; + + for (const element of currentElements) { + // Handle iframe and frame traversal + if (element.tagName === "IFRAME" || element.tagName === "FRAME") { + try { + const frameDoc = + element.contentDocument || element.contentWindow.document; + if (frameDoc) { + if (isXPathSelector(part)) { + nextElements.push(...evaluateXPathAll(frameDoc, part)); + } else { + nextElements.push( + ...Array.from(frameDoc.querySelectorAll(part)) + ); + } + } + } catch (e) { + console.warn( + `Cannot access ${element.tagName.toLowerCase()} content:`, + e + ); + continue; + } + } else { + // Regular DOM elements + if (element.querySelectorAll) { + if (isXPathSelector(part)) { + nextElements.push(...evaluateXPathAll(element, part)); } else { - // Regular DOM elements - if (element.querySelectorAll) { - nextElements.push(...element.querySelectorAll(part)); - } - - // Shadow DOM elements - if (element.shadowRoot) { - nextElements.push(...element.shadowRoot.querySelectorAll(part)); - } - - // Check children's shadow roots - const children = Array.from(element.children || []); - for (const child of children) { - if (child.shadowRoot) { - nextElements.push(...child.shadowRoot.querySelectorAll(part)); - } - } + nextElements.push( + ...Array.from(element.querySelectorAll(part)) + ); } + } + + // Shadow DOM elements + if (element.shadowRoot) { + if (isXPathSelector(part)) { + nextElements.push( + ...evaluateXPathAll(element.shadowRoot, part) + ); + } else { + nextElements.push( + ...Array.from(element.shadowRoot.querySelectorAll(part)) + ); + } + } + + // Check children's shadow roots + const children = Array.from(element.children || []); + for (const child of children) { + if (child.shadowRoot) { + if (isXPathSelector(part)) { + nextElements.push( + ...evaluateXPathAll(child.shadowRoot, part) + ); + } else { + nextElements.push( + ...Array.from(child.shadowRoot.querySelectorAll(part)) + ); + } + } + } } + } - currentElements = nextElements; + currentElements = nextElements; } return currentElements; }; // Enhanced value extraction with context awareness - function extractValue(element, attribute) { - if (!element) return null; - - // Get context-aware base URL - const baseURL = element.ownerDocument?.location?.href || window.location.origin; - - // Check shadow root first - if (element.shadowRoot) { - const shadowContent = element.shadowRoot.textContent; - if (shadowContent?.trim()) { - return shadowContent.trim(); - } + const extractValue = (element, attribute) => { + if (!element) return null; + + // Get context-aware base URL + const baseURL = + element.ownerDocument?.location?.href || window.location.origin; + + // Check shadow root first + if (element.shadowRoot) { + const shadowContent = element.shadowRoot.textContent; + if (shadowContent?.trim()) { + return shadowContent.trim(); } - - if (attribute === 'innerText') { - return element.innerText.trim(); - } else if (attribute === 'innerHTML') { - return element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - if (attribute === 'href' && element.tagName !== 'A') { - const parentElement = element.parentElement; - if (parentElement && parentElement.tagName === 'A') { - const parentHref = parentElement.getAttribute('href'); - if (parentHref) { - try { - return new URL(parentHref, baseURL).href; - } catch (e) { - return parentHref; - } - } - } + } + + if (attribute === "innerText") { + // First try standard innerText/textContent + let textContent = + element.innerText?.trim() || element.textContent?.trim(); + + // If empty, check for common data attributes that might contain the text + if (!textContent) { + const dataAttributes = [ + "data-600", + "data-text", + "data-label", + "data-value", + "data-content", + ]; + for (const attr of dataAttributes) { + const dataValue = element.getAttribute(attr); + if (dataValue && dataValue.trim()) { + textContent = dataValue.trim(); + break; } - - const attrValue = element.getAttribute(attribute); - const dataAttr = attrValue || element.getAttribute('data-' + attribute); - - if (!dataAttr || dataAttr.trim() === '') { - if (attribute === 'src') { - const style = window.getComputedStyle(element); - const bgImage = style.backgroundImage; - if (bgImage && bgImage !== 'none') { - const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); - return matches ? new URL(matches[1], baseURL).href : null; - } - } - return null; + } + } + + return textContent || null; + } else if (attribute === "innerHTML") { + return element.innerHTML?.trim() || null; + } else if (attribute === "src" || attribute === "href") { + if (attribute === "href" && element.tagName !== "A") { + const parentElement = element.parentElement; + if (parentElement && parentElement.tagName === "A") { + const parentHref = parentElement.getAttribute("href"); + if (parentHref) { + try { + return new URL(parentHref, baseURL).href; + } catch (e) { + return parentHref; + } } - - try { - return new URL(dataAttr, baseURL).href; - } catch (e) { - console.warn('Error creating URL from', dataAttr, e); - return dataAttr; // Return the original value if URL construction fails + } + } + + const attrValue = element.getAttribute(attribute); + const dataAttr = attrValue || element.getAttribute("data-" + attribute); + + if (!dataAttr || dataAttr.trim() === "") { + if (attribute === "src") { + const style = window.getComputedStyle(element); + const bgImage = style.backgroundImage; + if (bgImage && bgImage !== "none") { + const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); + return matches ? new URL(matches[1], baseURL).href : null; } + } + return null; + } + + try { + return new URL(dataAttr, baseURL).href; + } catch (e) { + console.warn("Error creating URL from", dataAttr, e); + return dataAttr; } - return element.getAttribute(attribute); } + return element.getAttribute(attribute); + }; // Enhanced table ancestor finding with context support - function findTableAncestor(element) { + const findTableAncestor = (element) => { let currentElement = element; const MAX_DEPTH = 5; let depth = 0; - + while (currentElement && depth < MAX_DEPTH) { - // Handle shadow DOM - if (currentElement.getRootNode() instanceof ShadowRoot) { - currentElement = currentElement.getRootNode().host; - continue; - } - - if (currentElement.tagName === 'TD') { - return { type: 'TD', element: currentElement }; - } else if (currentElement.tagName === 'TR') { - return { type: 'TR', element: currentElement }; - } - - // Handle iframe and frame crossing - if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { - try { - currentElement = currentElement.contentDocument.body; - } catch (e) { - return null; - } - } else { - currentElement = currentElement.parentElement; + // Handle shadow DOM + if (currentElement.getRootNode() instanceof ShadowRoot) { + currentElement = currentElement.getRootNode().host; + continue; + } + + if (currentElement.tagName === "TD") { + return { type: "TD", element: currentElement }; + } else if (currentElement.tagName === "TR") { + return { type: "TR", element: currentElement }; + } + + // Handle iframe and frame crossing + if ( + currentElement.tagName === "IFRAME" || + currentElement.tagName === "FRAME" + ) { + try { + currentElement = currentElement.contentDocument.body; + } catch (e) { + return null; } - depth++; + } else { + currentElement = currentElement.parentElement; + } + depth++; } return null; - } + }; // Helper function to get cell index - function getCellIndex(td) { + const getCellIndex = (td) => { if (td.getRootNode() instanceof ShadowRoot) { - const shadowRoot = td.getRootNode(); - const allCells = Array.from(shadowRoot.querySelectorAll('td')); - return allCells.indexOf(td); + const shadowRoot = td.getRootNode(); + const allCells = Array.from(shadowRoot.querySelectorAll("td")); + return allCells.indexOf(td); } - + let index = 0; let sibling = td; - while (sibling = sibling.previousElementSibling) { - index++; + while ((sibling = sibling.previousElementSibling)) { + index++; } return index; - } + }; // Helper function to check for TH elements - function hasThElement(row, tableFields) { + const hasThElement = (row, tableFields) => { for (const [_, { selector }] of Object.entries(tableFields)) { - const element = queryElement(row, selector); - if (element) { - let current = element; - while (current && current !== row) { - if (current.getRootNode() instanceof ShadowRoot) { - current = current.getRootNode().host; - continue; - } - - if (current.tagName === 'TH') return true; - - if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') { - try { - current = current.contentDocument.body; - } catch (e) { - break; - } - } else { - current = current.parentElement; - } + const element = queryElement(row, selector); + if (element) { + let current = element; + while (current && current !== row) { + if (current.getRootNode() instanceof ShadowRoot) { + current = current.getRootNode().host; + continue; + } + + if (current.tagName === "TH") return true; + + if (current.tagName === "IFRAME" || current.tagName === "FRAME") { + try { + current = current.contentDocument.body; + } catch (e) { + break; } + } else { + current = current.parentElement; + } } + } } return false; - } + }; // Helper function to filter rows - function filterRowsBasedOnTag(rows, tableFields) { - for (const row of rows) { - if (hasThElement(row, tableFields)) { - return rows; - } + const filterRowsBasedOnTag = (rows, tableFields) => { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; } - // Include shadow DOM in TH search - return rows.filter(row => { - const directTH = row.getElementsByTagName('TH').length === 0; - const shadowTH = row.shadowRoot ? - row.shadowRoot.querySelector('th') === null : true; - return directTH && shadowTH; - }); - } + } + return rows.filter((row) => { + const directTH = row.getElementsByTagName("TH").length === 0; + const shadowTH = row.shadowRoot + ? row.shadowRoot.querySelector("th") === null + : true; + return directTH && shadowTH; + }); + }; // Class similarity comparison functions - function calculateClassSimilarity(classList1, classList2) { - const set1 = new Set(classList1); - const set2 = new Set(classList2); - const intersection = new Set([...set1].filter(x => set2.has(x))); - const union = new Set([...set1, ...set2]); - return intersection.size / union.size; - } + const calculateClassSimilarity = (classList1, classList2) => { + const set1 = new Set(classList1); + const set2 = new Set(classList2); + const intersection = new Set([...set1].filter((x) => set2.has(x))); + const union = new Set([...set1, ...set2]); + return intersection.size / union.size; + }; // Enhanced similar elements finding with context support - function findSimilarElements(baseElement, similarityThreshold = 0.7) { + const findSimilarElements = (baseElement, similarityThreshold = 0.7) => { const baseClasses = Array.from(baseElement.classList); if (baseClasses.length === 0) return []; const allElements = []; - + // Get elements from main document allElements.push(...document.getElementsByTagName(baseElement.tagName)); - + // Get elements from shadow DOM if (baseElement.getRootNode() instanceof ShadowRoot) { - const shadowHost = baseElement.getRootNode().host; - allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName)); + const shadowHost = baseElement.getRootNode().host; + allElements.push( + ...shadowHost.getElementsByTagName(baseElement.tagName) + ); } - + // Get elements from iframes and frames const frames = [ - ...Array.from(document.getElementsByTagName('iframe')), - ...Array.from(document.getElementsByTagName('frame')) + ...Array.from(document.getElementsByTagName("iframe")), + ...Array.from(document.getElementsByTagName("frame")), ]; - + for (const frame of frames) { - try { - const frameDoc = frame.contentDocument || frame.contentWindow.document; - allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName)); - } catch (e) { - console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e); - } + try { + const frameDoc = + frame.contentDocument || frame.contentWindow.document; + allElements.push( + ...frameDoc.getElementsByTagName(baseElement.tagName) + ); + } catch (e) { + console.warn( + `Cannot access ${frame.tagName.toLowerCase()} content:`, + e + ); + } } - return allElements.filter(element => { - if (element === baseElement) return false; - const similarity = calculateClassSimilarity( - baseClasses, - Array.from(element.classList) - ); - return similarity >= similarityThreshold; + return allElements.filter((element) => { + if (element === baseElement) return false; + const similarity = calculateClassSimilarity( + baseClasses, + Array.from(element.classList) + ); + return similarity >= similarityThreshold; }); - } + }; - function tryFallbackSelector(rootElement, originalSelector) { - let element = queryElement(rootElement, originalSelector); - - if (!element && originalSelector.includes('nth-child')) { - const match = originalSelector.match(/nth-child\((\d+)\)/); - if (match) { - const position = parseInt(match[1], 10); - - for (let i = position - 1; i >= 1; i--) { - const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`); - element = queryElement(rootElement, fallbackSelector); - if (element) break; - } - - if (!element) { - const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, ''); - element = queryElement(rootElement, baseSelector); - } - } + const tryFallbackSelector = (rootElement, originalSelector) => { + let element = queryElement(rootElement, originalSelector); + + if (!element && originalSelector.includes("nth-child")) { + const match = originalSelector.match(/nth-child\((\d+)\)/); + if (match) { + const position = parseInt(match[1], 10); + + for (let i = position - 1; i >= 1; i--) { + const fallbackSelector = originalSelector.replace( + /nth-child\(\d+\)/, + `nth-child(${i})` + ); + element = queryElement(rootElement, fallbackSelector); + if (element) break; + } + + if (!element) { + const baseSelector = originalSelector.replace( + /\:nth-child\(\d+\)/, + "" + ); + element = queryElement(rootElement, baseSelector); + } } - - return element; - } + } + + return element; + }; + + // Create indexed XPath for specific container instance + const createIndexedXPath = ( + childSelector, + listSelector, + containerIndex + ) => { + // Check if the child selector contains the list selector pattern + if (childSelector.includes(listSelector.replace("//", ""))) { + // Replace the list selector part with indexed version + const listPattern = listSelector.replace("//", ""); + const indexedListSelector = `(${listSelector})[${containerIndex}]`; + + const indexedSelector = childSelector.replace( + `//${listPattern}`, + indexedListSelector + ); + + return indexedSelector; + } else { + // If pattern doesn't match, create a more generic indexed selector + return `(${listSelector})[${containerIndex}]${childSelector.replace( + "//", + "/" + )}`; + } + }; + + // Main scraping logic with unified support for both CSS and XPath + console.log("🚀 Starting unified list data extraction"); + console.log("List Selector:", listSelector); + console.log("Fields:", fields); - // Main scraping logic with context support let containers = queryElementAll(document, listSelector); containers = Array.from(containers); - if (containers.length === 0) return []; + if (containers.length === 0) { + console.warn("❌ No containers found for listSelector:", listSelector); + return []; + } - if (limit > 1 && containers.length === 1) { + console.log(`📦 Found ${containers.length} list containers`); + + // For CSS selectors, try to find similar containers if needed + if ( + !isXPathSelector(listSelector) && + limit > 1 && + containers.length === 1 + ) { const baseContainer = containers[0]; const similarContainers = findSimilarElements(baseContainer); - + if (similarContainers.length > 0) { - const newContainers = similarContainers.filter(container => - !container.matches(listSelector) - ); - containers = [...containers, ...newContainers]; + const newContainers = similarContainers.filter( + (container) => !container.matches(listSelector) + ); + containers = [...containers, ...newContainers]; } } const containerFields = containers.map(() => ({ tableFields: {}, - nonTableFields: {} + nonTableFields: {}, })); - // Classify fields + // For XPath selectors, use the new approach + if (isXPathSelector(listSelector)) { + const extractedData = []; + const containersToProcess = Math.min(containers.length, limit); + + for ( + let containerIndex = 0; + containerIndex < containersToProcess; + containerIndex++ + ) { + const record = {}; + + for (const [label, field] of Object.entries(fields)) { + let element = null; + + if (isXPathSelector(field.selector)) { + // Create indexed absolute XPath + const indexedSelector = createIndexedXPath( + field.selector, + listSelector, + containerIndex + 1 + ); + element = evaluateXPath(document, indexedSelector); + } else { + // Fallback for CSS selectors within XPath containers + const container = containers[containerIndex]; + element = queryElement(container, field.selector); + } + + if (element) { + const value = extractValue(element, field.attribute); + if (value !== null && value !== "") { + record[label] = value; + } else { + record[label] = ""; + } + } else { + record[label] = ""; + } + } + + if (Object.values(record).some((value) => value !== "")) { + extractedData.push(record); + } + } + + console.log(`📊 Total records extracted: ${extractedData.length}`); + return extractedData; + } + + // For CSS selectors, use the original table-aware approach containers.forEach((container, containerIndex) => { for (const [label, field] of Object.entries(fields)) { const sampleElement = queryElement(container, field.selector); - + if (sampleElement) { - const ancestor = findTableAncestor(sampleElement); - if (ancestor) { - containerFields[containerIndex].tableFields[label] = { - ...field, - tableContext: ancestor.type, - cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 - }; - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } else { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: + ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1, + }; + } else { containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; } } }); @@ -798,149 +1073,192 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const nonTableData = []; // Process table data with support for iframes, frames, and shadow DOM - for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + for ( + let containerIndex = 0; + containerIndex < containers.length; + containerIndex++ + ) { const container = containers[containerIndex]; const { tableFields } = containerFields[containerIndex]; if (Object.keys(tableFields).length > 0) { - const firstField = Object.values(tableFields)[0]; - const firstElement = queryElement(container, firstField.selector); - let tableContext = firstElement; - - // Find table context including iframe, frame and shadow DOM - while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { - if (tableContext.getRootNode() instanceof ShadowRoot) { - tableContext = tableContext.getRootNode().host; - continue; - } - - if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { - try { - tableContext = tableContext.contentDocument.body; - } catch (e) { - break; - } - } else { - tableContext = tableContext.parentElement; - } + const firstField = Object.values(tableFields)[0]; + const firstElement = queryElement(container, firstField.selector); + let tableContext = firstElement; + + // Find table context including iframe, frame and shadow DOM + while ( + tableContext && + tableContext.tagName !== "TABLE" && + tableContext !== container + ) { + if (tableContext.getRootNode() instanceof ShadowRoot) { + tableContext = tableContext.getRootNode().host; + continue; } - if (tableContext) { - // Get rows from all contexts - const rows = []; - - // Get rows from regular DOM - rows.push(...tableContext.getElementsByTagName('TR')); - - // Get rows from shadow DOM - if (tableContext.shadowRoot) { - rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); - } - - // Get rows from iframes and frames - if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { - try { - const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document; - rows.push(...frameDoc.getElementsByTagName('TR')); - } catch (e) { - console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e); + if ( + tableContext.tagName === "IFRAME" || + tableContext.tagName === "FRAME" + ) { + try { + tableContext = tableContext.contentDocument.body; + } catch (e) { + break; + } + } else { + tableContext = tableContext.parentElement; + } + } + + if (tableContext) { + // Get rows from all contexts + const rows = []; + + // Get rows from regular DOM + rows.push(...tableContext.getElementsByTagName("TR")); + + // Get rows from shadow DOM + if (tableContext.shadowRoot) { + rows.push(...tableContext.shadowRoot.getElementsByTagName("TR")); + } + + // Get rows from iframes and frames + if ( + tableContext.tagName === "IFRAME" || + tableContext.tagName === "FRAME" + ) { + try { + const frameDoc = + tableContext.contentDocument || + tableContext.contentWindow.document; + rows.push(...frameDoc.getElementsByTagName("TR")); + } catch (e) { + console.warn( + `Cannot access ${tableContext.tagName.toLowerCase()} rows:`, + e + ); + } + } + + const processedRows = filterRowsBasedOnTag(rows, tableFields); + + for ( + let rowIndex = 0; + rowIndex < Math.min(processedRows.length, limit); + rowIndex++ + ) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [ + label, + { selector, attribute, cellIndex }, + ] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + // Get TD element considering both contexts + let td = currentRow.children[cellIndex]; + + // Check shadow DOM for td + if (!td && currentRow.shadowRoot) { + const shadowCells = currentRow.shadowRoot.children; + if (shadowCells && shadowCells.length > cellIndex) { + td = shadowCells[cellIndex]; } - } - - const processedRows = filterRowsBasedOnTag(rows, tableFields); - - for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { - const record = {}; - const currentRow = processedRows[rowIndex]; - - for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { - let element = null; - - if (cellIndex >= 0) { - // Get TD element considering both contexts - let td = currentRow.children[cellIndex]; - - // Check shadow DOM for td - if (!td && currentRow.shadowRoot) { - const shadowCells = currentRow.shadowRoot.children; - if (shadowCells && shadowCells.length > cellIndex) { - td = shadowCells[cellIndex]; - } - } - - if (td) { - element = queryElement(td, selector); - - if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) { - element = td; - } - - if (!element) { - const tagOnlySelector = selector.split('.')[0]; - element = queryElement(td, tagOnlySelector); - } - - if (!element) { - let currentElement = td; - while (currentElement && currentElement.children.length > 0) { - let foundContentChild = false; - for (const child of currentElement.children) { - if (extractValue(child, attribute)) { - currentElement = child; - foundContentChild = true; - break; - } - } - if (!foundContentChild) break; - } - element = currentElement; - } - } - } else { - element = queryElement(currentRow, selector); - } - - if (element) { - record[label] = extractValue(element, attribute); - } + } + + if (td) { + element = queryElement(td, selector); + + if ( + !element && + selector + .split(/(?:>>|:>>)/) + .pop() + .includes("td:nth-child") + ) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split(".")[0]; + element = queryElement(td, tagOnlySelector); } - if (Object.keys(record).length > 0) { - tableData.push(record); + if (!element) { + let currentElement = td; + while ( + currentElement && + currentElement.children.length > 0 + ) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; } + } + } else { + element = queryElement(currentRow, selector); + } + + if (element) { + record[label] = extractValue(element, attribute); } + } + + if (Object.keys(record).length > 0) { + tableData.push(record); + } } + } } } // Process non-table data with all contexts support - for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + for ( + let containerIndex = 0; + containerIndex < containers.length; + containerIndex++ + ) { if (nonTableData.length >= limit) break; const container = containers[containerIndex]; const { nonTableFields } = containerFields[containerIndex]; if (Object.keys(nonTableFields).length > 0) { - const record = {}; + const record = {}; - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - // Get the last part of the selector after any context delimiter - const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0]; - const element = tryFallbackSelector(container, relativeSelector); - - if (element) { - record[label] = extractValue(element, attribute); - } - } - - if (Object.keys(record).length > 0) { - nonTableData.push(record); + for (const [label, { selector, attribute }] of Object.entries( + nonTableFields + )) { + // Get the last part of the selector after any context delimiter + const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0]; + const element = tryFallbackSelector(container, relativeSelector); + + if (element) { + record[label] = extractValue(element, attribute); } - } + } + + if (Object.keys(record).length > 0) { + nonTableData.push(record); + } + } } - + // Merge and limit the results const scrapedData = [...tableData, ...nonTableData]; + console.log(`📊 Total records extracted: ${scrapedData.length}`); + return scrapedData; }; diff --git a/server/src/browser-management/classes/RemoteBrowser.ts b/server/src/browser-management/classes/RemoteBrowser.ts index e5c974dae..51f33574e 100644 --- a/server/src/browser-management/classes/RemoteBrowser.ts +++ b/server/src/browser-management/classes/RemoteBrowser.ts @@ -243,7 +243,7 @@ export class RemoteBrowser { scripts: [] as Array<{ src: string; content: string; type?: string }>, media: [] as Array<{ src: string; dataUrl: string; type: string }>, }; - + const viewport = (await this.currentPage?.viewportSize()) || { width: 1280, height: 720, @@ -617,7 +617,7 @@ export class RemoteBrowser { ); await this.context.addInitScript({ path: './server/src/browser-management/classes/rrweb-bundle.js' }); - + this.currentPage = await this.context.newPage(); await this.setupPageEventListeners(this.currentPage); @@ -1286,129 +1286,181 @@ export class RemoteBrowser { * @returns void */ public registerEditorEvents = (): void => { - // For each event, include userId to make sure events are handled for the correct browser - logger.log('debug', `Registering editor events for user: ${this.userId}`); + // For each event, include userId to make sure events are handled for the correct browser + logger.log("debug", `Registering editor events for user: ${this.userId}`); - this.socket.on(`captureDirectScreenshot:${this.userId}`, async (settings) => { - logger.debug(`Direct screenshot capture requested for user ${this.userId}`); + this.socket.on( + `captureDirectScreenshot:${this.userId}`, + async (settings) => { + logger.debug( + `Direct screenshot capture requested for user ${this.userId}` + ); await this.captureDirectScreenshot(settings); - }); + } + ); - // For backward compatibility - this.socket.on('captureDirectScreenshot', async (settings) => { - await this.captureDirectScreenshot(settings); - }); - - // Listen for specific events for this user - this.socket.on(`rerender:${this.userId}`, async () => { - logger.debug(`Rerender event received for user ${this.userId}`); - await this.makeAndEmitScreenshot(); - }); - - // For backward compatibility, also listen to the general event - this.socket.on('rerender', async () => { - logger.debug(`General rerender event received, checking if for user ${this.userId}`); - await this.makeAndEmitScreenshot(); - }); - - this.socket.on(`settings:${this.userId}`, (settings) => { - this.interpreterSettings = settings; - logger.debug(`Settings updated for user ${this.userId}`); - }); - - this.socket.on(`changeTab:${this.userId}`, async (tabIndex) => { - logger.debug(`Tab change to ${tabIndex} requested for user ${this.userId}`); - await this.changeTab(tabIndex); - }); - - this.socket.on(`addTab:${this.userId}`, async () => { - logger.debug(`New tab requested for user ${this.userId}`); - await this.currentPage?.context().newPage(); - const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0; - await this.changeTab(lastTabIndex); - }); - - this.socket.on(`closeTab:${this.userId}`, async (tabInfo) => { - logger.debug(`Close tab ${tabInfo.index} requested for user ${this.userId}`); - const page = this.currentPage?.context().pages()[tabInfo.index]; - if (page) { - if (tabInfo.isCurrent) { - if (this.currentPage?.context().pages()[tabInfo.index + 1]) { - // next tab - await this.changeTab(tabInfo.index + 1); - } else { - //previous tab - await this.changeTab(tabInfo.index - 1); - } - } - await page.close(); - logger.log( - 'debug', - `Tab ${tabInfo.index} was closed for user ${this.userId}, new tab count: ${this.currentPage?.context().pages().length}` - ); + // For backward compatibility + this.socket.on("captureDirectScreenshot", async (settings) => { + await this.captureDirectScreenshot(settings); + }); + + // Listen for specific events for this user + this.socket.on(`rerender:${this.userId}`, async () => { + logger.debug(`Rerender event received for user ${this.userId}`); + if (this.renderingMode === "dom") { + await this.makeAndEmitDOMSnapshot(); + } else { + await this.makeAndEmitScreenshot(); + } + }); + + this.socket.on("rerender", async () => { + logger.debug( + `General rerender event received, checking if for user ${this.userId}` + ); + if (this.renderingMode === "dom") { + await this.makeAndEmitDOMSnapshot(); + } else { + await this.makeAndEmitScreenshot(); + } + }); + + this.socket.on(`settings:${this.userId}`, (settings) => { + this.interpreterSettings = settings; + logger.debug(`Settings updated for user ${this.userId}`); + }); + + this.socket.on(`changeTab:${this.userId}`, async (tabIndex) => { + logger.debug( + `Tab change to ${tabIndex} requested for user ${this.userId}` + ); + await this.changeTab(tabIndex); + }); + + this.socket.on(`addTab:${this.userId}`, async () => { + logger.debug(`New tab requested for user ${this.userId}`); + await this.currentPage?.context().newPage(); + const lastTabIndex = this.currentPage + ? this.currentPage.context().pages().length - 1 + : 0; + await this.changeTab(lastTabIndex); + }); + + this.socket.on(`closeTab:${this.userId}`, async (tabInfo) => { + logger.debug( + `Close tab ${tabInfo.index} requested for user ${this.userId}` + ); + const page = this.currentPage?.context().pages()[tabInfo.index]; + if (page) { + if (tabInfo.isCurrent) { + if (this.currentPage?.context().pages()[tabInfo.index + 1]) { + // next tab + await this.changeTab(tabInfo.index + 1); } else { - logger.log('error', `Tab index ${tabInfo.index} out of range for user ${this.userId}`); - } - }); - - this.socket.on(`setViewportSize:${this.userId}`, async (data: { width: number, height: number }) => { - const { width, height } = data; - logger.log('debug', `Viewport size change to width=${width}, height=${height} requested for user ${this.userId}`); - - // Update the browser context's viewport dynamically - if (this.context && this.browser) { - this.context = await this.browser.newContext({ viewport: { width, height } }); - logger.log('debug', `Viewport size updated to width=${width}, height=${height} for user ${this.userId}`); + //previous tab + await this.changeTab(tabInfo.index - 1); } - }); - - // For backward compatibility, also register the standard events - this.socket.on('settings', (settings) => this.interpreterSettings = settings); - this.socket.on('changeTab', async (tabIndex) => await this.changeTab(tabIndex)); - this.socket.on('addTab', async () => { - await this.currentPage?.context().newPage(); - const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0; - await this.changeTab(lastTabIndex); - }); - this.socket.on('closeTab', async (tabInfo) => { - const page = this.currentPage?.context().pages()[tabInfo.index]; - if (page) { - if (tabInfo.isCurrent) { - if (this.currentPage?.context().pages()[tabInfo.index + 1]) { - await this.changeTab(tabInfo.index + 1); - } else { - await this.changeTab(tabInfo.index - 1); - } - } - await page.close(); - } - }); - this.socket.on('setViewportSize', async (data: { width: number, height: number }) => { - const { width, height } = data; - if (this.context && this.browser) { - this.context = await this.browser.newContext({ viewport: { width, height } }); + } + await page.close(); + logger.log( + "debug", + `Tab ${tabInfo.index} was closed for user ${ + this.userId + }, new tab count: ${this.currentPage?.context().pages().length}` + ); + } else { + logger.log( + "error", + `Tab index ${tabInfo.index} out of range for user ${this.userId}` + ); + } + }); + + this.socket.on( + `setViewportSize:${this.userId}`, + async (data: { width: number; height: number }) => { + const { width, height } = data; + logger.log( + "debug", + `Viewport size change to width=${width}, height=${height} requested for user ${this.userId}` + ); + + // Update the browser context's viewport dynamically + if (this.context && this.browser) { + this.context = await this.browser.newContext({ + viewport: { width, height }, + }); + logger.log( + "debug", + `Viewport size updated to width=${width}, height=${height} for user ${this.userId}` + ); + } + } + ); + + // For backward compatibility, also register the standard events + this.socket.on( + "settings", + (settings) => (this.interpreterSettings = settings) + ); + this.socket.on( + "changeTab", + async (tabIndex) => await this.changeTab(tabIndex) + ); + this.socket.on("addTab", async () => { + await this.currentPage?.context().newPage(); + const lastTabIndex = this.currentPage + ? this.currentPage.context().pages().length - 1 + : 0; + await this.changeTab(lastTabIndex); + }); + this.socket.on("closeTab", async (tabInfo) => { + const page = this.currentPage?.context().pages()[tabInfo.index]; + if (page) { + if (tabInfo.isCurrent) { + if (this.currentPage?.context().pages()[tabInfo.index + 1]) { + await this.changeTab(tabInfo.index + 1); + } else { + await this.changeTab(tabInfo.index - 1); } - }); + } + await page.close(); + } + }); + this.socket.on( + "setViewportSize", + async (data: { width: number; height: number }) => { + const { width, height } = data; + if (this.context && this.browser) { + this.context = await this.browser.newContext({ + viewport: { width, height }, + }); + } + } + ); - this.socket.on('extractListData', async (data: { - listSelector: string, - fields: Record, - currentListId: number, - pagination: any + this.socket.on( + "extractListData", + async (data: { + listSelector: string; + fields: Record; + currentListId: number; + pagination: any; }) => { - if (this.currentPage) { - const extractedData = await this.extractListData( - this.currentPage, - data.listSelector, - data.fields - ); - - this.socket.emit('listDataExtracted', { - currentListId: data.currentListId, - data: extractedData - }); - } - }); + if (this.currentPage) { + const extractedData = await this.extractListData( + this.currentPage, + data.listSelector, + data.fields + ); + + this.socket.emit("listDataExtracted", { + currentListId: data.currentListId, + data: extractedData, + }); + } + } + ); }; /** * Subscribes the remote browser for a screencast session @@ -1476,15 +1528,12 @@ export class RemoteBrowser { this.isDOMStreamingActive = false; } } - + /** * CDP-based DOM snapshot creation using captured network resources */ public async makeAndEmitDOMSnapshot(): Promise { - if ( - !this.currentPage || - !this.isDOMStreamingActive - ) { + if (!this.currentPage || !this.isDOMStreamingActive) { return; } @@ -1537,10 +1586,11 @@ export class RemoteBrowser { if (typeof window.rrwebSnapshot === "undefined") { throw new Error("rrweb-snapshot library not available"); } - return window.rrwebSnapshot.snapshot(document, { - inlineImages: true, - collectFonts: true, - }); + + return window.rrwebSnapshot.snapshot(document, { + inlineImages: true, + collectFonts: true, + }); }); // Process the snapshot to proxy resources @@ -1557,10 +1607,12 @@ export class RemoteBrowser { this.emitRRWebSnapshot(enhancedSnapshot); } catch (error) { // Handle navigation context destruction gracefully - if (error instanceof Error && - (error.message.includes("Execution context was destroyed") || + if ( + error instanceof Error && + (error.message.includes("Execution context was destroyed") || error.message.includes("most likely because of a navigation") || - error.message.includes("Target closed"))) { + error.message.includes("Target closed")) + ) { logger.debug("DOM snapshot skipped due to page navigation or closure"); return; // Don't emit error for navigation - this is expected } @@ -1622,35 +1674,35 @@ export class RemoteBrowser { * @returns {Promise} */ public async switchOff(): Promise { - try { - this.isScreencastActive = false; - this.isDOMStreamingActive = false; + try { + this.isScreencastActive = false; + this.isDOMStreamingActive = false; - await this.interpreter.stopInterpretation(); + await this.interpreter.stopInterpretation(); - if (this.screencastInterval) { - clearInterval(this.screencastInterval); - } + if (this.screencastInterval) { + clearInterval(this.screencastInterval); + } - if (this.domUpdateInterval) { - clearInterval(this.domUpdateInterval); - } + if (this.domUpdateInterval) { + clearInterval(this.domUpdateInterval); + } - if (this.client) { - await this.stopScreencast(); - await this.stopDOM(); - } + if (this.client) { + await this.stopScreencast(); + await this.stopDOM(); + } - if (this.browser) { - await this.browser.close(); - } + if (this.browser) { + await this.browser.close(); + } - this.screenshotQueue = []; - //this.performanceMonitor.reset(); + this.screenshotQueue = []; + //this.performanceMonitor.reset(); - } catch (error) { - logger.error('Error during browser shutdown:', error); - } + } catch (error) { + logger.error('Error during browser shutdown:', error); + } } private async optimizeScreenshot(screenshot: Buffer): Promise { @@ -1772,6 +1824,7 @@ export class RemoteBrowser { const page = this.currentPage?.context().pages()[tabIndex]; if (page) { await this.stopScreencast(); + await this.stopDOM(); this.currentPage = page; await this.setupPageEventListeners(this.currentPage); @@ -1783,8 +1836,13 @@ export class RemoteBrowser { url: this.currentPage.url(), userId: this.userId }); - await this.makeAndEmitScreenshot(); - await this.subscribeToScreencast(); + if (this.isDOMStreamingActive) { + await this.makeAndEmitDOMSnapshot(); + await this.subscribeToDOM(); + } else { + await this.makeAndEmitScreenshot(); + await this.subscribeToScreencast(); + } } else { logger.log('error', `${tabIndex} index out of range of pages`) } diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 2de919825..eb0e33a72 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -464,7 +464,6 @@ export class WorkflowGenerator { public onClick = async (coordinates: Coordinates, page: Page) => { let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) }; const selector = await this.generateSelector(page, coordinates, ActionType.Click); - console.log("COOORDINATES: ", coordinates); logger.log('debug', `Element's selector: ${selector}`); const elementInfo = await getElementInformation(page, coordinates, '', false); @@ -999,6 +998,7 @@ export class WorkflowGenerator { rect, selector: displaySelector, elementInfo, + isDOMMode: this.isDOMMode, // Include shadow DOM specific information shadowInfo: elementInfo?.isShadowRoot ? { mode: elementInfo.shadowRootMode, diff --git a/src/components/browser/BrowserWindow.tsx b/src/components/browser/BrowserWindow.tsx index 9d11d3025..d3143f32c 100644 --- a/src/components/browser/BrowserWindow.tsx +++ b/src/components/browser/BrowserWindow.tsx @@ -11,7 +11,7 @@ import { useTranslation } from 'react-i18next'; import { AuthContext } from '../../context/auth'; import { coordinateMapper } from '../../helpers/coordinateMapper'; import { useBrowserDimensionsStore } from '../../context/browserDimensions'; -import { clientSelectorGenerator } from "../../helpers/clientSelectorGenerator"; +import { clientSelectorGenerator, ElementFingerprint } from "../../helpers/clientSelectorGenerator"; import DatePicker from "../pickers/DatePicker"; import Dropdown from "../pickers/Dropdown"; import TimePicker from "../pickers/TimePicker"; @@ -147,15 +147,14 @@ export const BrowserWindow = () => { const { browserWidth, browserHeight } = useBrowserDimensionsStore(); const [canvasRef, setCanvasReference] = useState | undefined>(undefined); const [screenShot, setScreenShot] = useState(""); - const [highlighterData, setHighlighterData] = useState<{ rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] } | null>(null); + const [highlighterData, setHighlighterData] = useState<{ rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[], groupElements?: Array<{ element: HTMLElement; rect: DOMRect } >} | null>(null); const [showAttributeModal, setShowAttributeModal] = useState(false); const [attributeOptions, setAttributeOptions] = useState([]); const [selectedElement, setSelectedElement] = useState<{ selector: string, info: ElementInfo | null } | null>(null); const [currentListId, setCurrentListId] = useState(null); const [viewportInfo, setViewportInfo] = useState({ width: browserWidth, height: browserHeight }); - const [isDOMMode, setIsDOMMode] = useState(false); - const [currentSnapshot, setCurrentSnapshot] = useState(null); const [isLoading, setIsLoading] = useState(false); + const [cachedChildSelectors, setCachedChildSelectors] = useState([]); const [listSelector, setListSelector] = useState(null); const [fields, setFields] = useState>({}); @@ -164,9 +163,15 @@ export const BrowserWindow = () => { const highlighterUpdateRef = useRef(0); const { socket } = useSocketStore(); - const { notify, currentTextActionId, currentListActionId } = useGlobalInfoStore(); + const { notify, currentTextActionId, currentListActionId, updateDOMMode, isDOMMode, currentSnapshot } = useGlobalInfoStore(); const { getText, getList, paginationMode, paginationType, limitMode, captureStage } = useActionContext(); const { addTextStep, addListStep, updateListStepData } = useBrowserSteps(); + + const [currentGroupInfo, setCurrentGroupInfo] = useState<{ + isGroupElement: boolean; + groupSize: number; + groupElements: HTMLElement[]; + } | null>(null); const { state } = useContext(AuthContext); const { user } = state; @@ -243,51 +248,47 @@ export const BrowserWindow = () => { (data: RRWebDOMCastData) => { if (!data.userId || data.userId === user?.id) { if (data.snapshotData && data.snapshotData.snapshot) { - setCurrentSnapshot(data.snapshotData); - setIsDOMMode(true); + updateDOMMode(true, data.snapshotData); socket?.emit("dom-mode-enabled"); - setIsLoading(false); } else { setIsLoading(false); } } }, - [user?.id, socket] + [user?.id, socket, updateDOMMode] ); const domModeHandler = useCallback( (data: any) => { if (!data.userId || data.userId === user?.id) { - setIsDOMMode(true); + updateDOMMode(true); socket?.emit("dom-mode-enabled"); setIsLoading(false); } }, - [user?.id, socket] + [user?.id, socket, updateDOMMode] ); const screenshotModeHandler = useCallback( (data: any) => { if (!data.userId || data.userId === user?.id) { - setIsDOMMode(false); + updateDOMMode(false); socket?.emit("screenshot-mode-enabled"); - setCurrentSnapshot(null); setIsLoading(false); } }, - [user?.id] + [user?.id, updateDOMMode] ); const domModeErrorHandler = useCallback( (data: any) => { if (!data.userId || data.userId === user?.id) { - setIsDOMMode(false); - setCurrentSnapshot(null); + updateDOMMode(false); setIsLoading(false); } }, - [user?.id] + [user?.id, updateDOMMode] ); useEffect(() => { @@ -304,8 +305,23 @@ export const BrowserWindow = () => { socket?.emit("listSelector", { selector: listSelector }); clientSelectorGenerator.setListSelector(listSelector); + + setCachedChildSelectors([]); + + if (currentSnapshot) { + const iframeElement = document.querySelector( + "#dom-browser-iframe" + ) as HTMLIFrameElement; + if (iframeElement?.contentDocument) { + const childSelectors = clientSelectorGenerator.getChildSelectors( + iframeElement.contentDocument, + listSelector + ); + setCachedChildSelectors(childSelectors); + } + } } - }, [isDOMMode, listSelector, socket, getList]); + }, [isDOMMode, listSelector, socket, getList, currentSnapshot]); useEffect(() => { coordinateMapper.updateDimensions(dimensions.width, dimensions.height, viewportInfo.width, viewportInfo.height); @@ -345,6 +361,7 @@ export const BrowserWindow = () => { setListSelector(null); setFields({}); setCurrentListId(null); + setCachedChildSelectors([]); }, []); useEffect(() => { @@ -372,7 +389,7 @@ export const BrowserWindow = () => { socket.on("screencast", screencastHandler); socket.on("domcast", rrwebSnapshotHandler); socket.on("dom-mode-enabled", domModeHandler); - socket.on("screenshot-mode-enabled", screenshotModeHandler); + // socket.on("screenshot-mode-enabled", screenshotModeHandler); socket.on("dom-mode-error", domModeErrorHandler); } @@ -386,7 +403,7 @@ export const BrowserWindow = () => { socket.off("screencast", screencastHandler); socket.off("domcast", rrwebSnapshotHandler); socket.off("dom-mode-enabled", domModeHandler); - socket.off("screenshot-mode-enabled", screenshotModeHandler); + // socket.off("screenshot-mode-enabled", screenshotModeHandler); socket.off("dom-mode-error", domModeErrorHandler); } }; @@ -398,7 +415,7 @@ export const BrowserWindow = () => { screencastHandler, rrwebSnapshotHandler, domModeHandler, - screenshotModeHandler, + // screenshotModeHandler, domModeErrorHandler, ]); @@ -408,8 +425,19 @@ export const BrowserWindow = () => { selector: string; elementInfo: ElementInfo | null; childSelectors?: string[]; + groupInfo?: { + isGroupElement: boolean; + groupSize: number; + groupElements: HTMLElement[]; + groupFingerprint: ElementFingerprint; + }; isDOMMode?: boolean; }) => { + if (!getText && !getList) { + setHighlighterData(null); + return; + } + if (!isDOMMode || !currentSnapshot) { return; } @@ -420,19 +448,10 @@ export const BrowserWindow = () => { if (!iframeElement) { iframeElement = document.querySelector( - "#browser-window iframe" + "#browser-window iframe" ) as HTMLIFrameElement; } - if (!iframeElement) { - const browserWindow = document.querySelector("#browser-window"); - if (browserWindow) { - iframeElement = browserWindow.querySelector( - "iframe" - ) as HTMLIFrameElement; - } - } - if (!iframeElement) { console.error("Could not find iframe element for DOM highlighting"); return; @@ -441,6 +460,12 @@ export const BrowserWindow = () => { const iframeRect = iframeElement.getBoundingClientRect(); const IFRAME_BODY_PADDING = 16; + if (data.groupInfo) { + setCurrentGroupInfo(data.groupInfo); + } else { + setCurrentGroupInfo(null); + } + const absoluteRect = new DOMRect( data.rect.x + iframeRect.left - IFRAME_BODY_PADDING, data.rect.y + iframeRect.top - IFRAME_BODY_PADDING, @@ -451,12 +476,36 @@ export const BrowserWindow = () => { const mappedData = { ...data, rect: absoluteRect, + childSelectors: data.childSelectors || cachedChildSelectors, }; if (getList === true) { - if (listSelector) { - socket?.emit("listSelector", { selector: listSelector }); - const hasValidChildSelectors = + if (!listSelector && data.groupInfo?.isGroupElement) { + const updatedGroupElements = data.groupInfo.groupElements.map( + (element) => { + const elementRect = element.getBoundingClientRect(); + return { + element, + rect: new DOMRect( + elementRect.x + iframeRect.left - IFRAME_BODY_PADDING, + elementRect.y + iframeRect.top - IFRAME_BODY_PADDING, + elementRect.width, + elementRect.height + ), + }; + } + ); + + const mappedData = { + ...data, + rect: absoluteRect, + groupElements: updatedGroupElements, + childSelectors: data.childSelectors || cachedChildSelectors, + }; + + setHighlighterData(mappedData); + } else if (listSelector) { + const hasChildSelectors = Array.isArray(mappedData.childSelectors) && mappedData.childSelectors.length > 0; @@ -464,69 +513,15 @@ export const BrowserWindow = () => { setHighlighterData(null); } else if (paginationMode) { if ( - paginationType !== "" && - !["none", "scrollDown", "scrollUp"].includes(paginationType) + paginationType !== "" && + !["none", "scrollDown", "scrollUp"].includes(paginationType) ) { setHighlighterData(mappedData); } else { setHighlighterData(null); } - } else if ( - mappedData.childSelectors && - mappedData.childSelectors.includes(mappedData.selector) - ) { + } else if (hasChildSelectors) { setHighlighterData(mappedData); - } else if ( - mappedData.elementInfo?.isIframeContent && - mappedData.childSelectors - ) { - const isIframeChild = mappedData.childSelectors.some( - (childSelector) => - mappedData.selector.includes(":>>") && - childSelector - .split(":>>") - .some((part) => mappedData.selector.includes(part.trim())) - ); - setHighlighterData(isIframeChild ? mappedData : null); - } else if ( - mappedData.selector.includes(":>>") && - hasValidChildSelectors - ) { - const selectorParts = mappedData.selector - .split(":>>") - .map((part) => part.trim()); - const isValidMixedSelector = selectorParts.some((part) => - mappedData.childSelectors!.some((childSelector) => - childSelector.includes(part) - ) - ); - setHighlighterData(isValidMixedSelector ? mappedData : null); - } else if ( - mappedData.elementInfo?.isShadowRoot && - mappedData.childSelectors - ) { - const isShadowChild = mappedData.childSelectors.some( - (childSelector) => - mappedData.selector.includes(">>") && - childSelector - .split(">>") - .some((part) => mappedData.selector.includes(part.trim())) - ); - setHighlighterData(isShadowChild ? mappedData : null); - } else if ( - mappedData.selector.includes(">>") && - hasValidChildSelectors - ) { - const selectorParts = mappedData.selector - .split(">>") - .map((part) => part.trim()); - const isValidMixedSelector = selectorParts.some((part) => - mappedData.childSelectors!.some((childSelector) => - childSelector.includes(part) - ) - ); - - setHighlighterData(isValidMixedSelector ? mappedData : null); } else { setHighlighterData(null); } @@ -534,23 +529,29 @@ export const BrowserWindow = () => { setHighlighterData(mappedData); } } else { - // getText mode setHighlighterData(mappedData); } }, [ isDOMMode, currentSnapshot, + getText, getList, socket, listSelector, paginationMode, paginationType, limitMode, + cachedChildSelectors, ] ); - const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] }) => { + const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[], isDOMMode?: boolean; }) => { + if (isDOMMode || data.isDOMMode) { + domHighlighterHandler(data); + return; + } + const now = performance.now(); if (now - highlighterUpdateRef.current < 16) { return; @@ -652,6 +653,20 @@ export const BrowserWindow = () => { }; }, [socket, highlighterHandler, onMouseMove, getList, listSelector]); + useEffect(() => { + document.addEventListener("mousemove", onMouseMove, false); + if (socket) { + socket.off("highlighter", highlighterHandler); + socket.on("highlighter", highlighterHandler); + } + return () => { + document.removeEventListener("mousemove", onMouseMove); + if (socket) { + socket.off("highlighter", highlighterHandler); + } + }; + }, [socket, highlighterHandler, getList, listSelector]); + useEffect(() => { if (socket && listSelector) { console.log('Syncing list selector with server:', listSelector); @@ -668,312 +683,355 @@ export const BrowserWindow = () => { }, [captureStage, listSelector, socket]); const handleDOMElementSelection = useCallback( - (highlighterData: { - rect: DOMRect; - selector: string; - elementInfo: ElementInfo | null; - childSelectors?: string[]; - }) => { - setShowAttributeModal(false); - setSelectedElement(null); - setAttributeOptions([]); + (highlighterData: { + rect: DOMRect; + selector: string; + elementInfo: ElementInfo | null; + childSelectors?: string[]; + groupInfo?: { + isGroupElement: boolean; + groupSize: number; + groupElements: HTMLElement[]; + }; + }) => { + setShowAttributeModal(false); + setSelectedElement(null); + setAttributeOptions([]); + + if (paginationMode && getList) { + if ( + paginationType !== "" && + paginationType !== "scrollDown" && + paginationType !== "scrollUp" && + paginationType !== "none" + ) { + setPaginationSelector(highlighterData.selector); + notify( + `info`, + t( + "browser_window.attribute_modal.notifications.pagination_select_success" + ) + ); + addListStep( + listSelector!, + fields, + currentListId || 0, + currentListActionId || `list-${crypto.randomUUID()}`, + { type: paginationType, selector: highlighterData.selector } + ); + socket?.emit("setPaginationMode", { pagination: false }); + } + return; + } - const options = getAttributeOptions( + if ( + getList === true && + !listSelector && + highlighterData.groupInfo?.isGroupElement + ) { + let cleanedSelector = highlighterData.selector; + + setListSelector(cleanedSelector); + notify( + `info`, + t( + "browser_window.attribute_modal.notifications.list_select_success", + { + count: highlighterData.groupInfo.groupSize, + } + ) || + `Selected group with ${highlighterData.groupInfo.groupSize} similar elements` + ); + setCurrentListId(Date.now()); + setFields({}); + + socket?.emit("setGetList", { getList: true }); + socket?.emit("listSelector", { selector: cleanedSelector }); + + return; + } + + if (getList === true && listSelector && currentListId) { + const options = getAttributeOptions( highlighterData.elementInfo?.tagName || "", highlighterData.elementInfo - ); + ); + + if (options.length === 1) { + const attribute = options[0].value; + let currentSelector = highlighterData.selector; + + const data = + attribute === "href" + ? highlighterData.elementInfo?.url || "" + : attribute === "src" + ? highlighterData.elementInfo?.imageUrl || "" + : highlighterData.elementInfo?.innerText || ""; + + const newField: TextStep = { + id: Date.now(), + type: "text", + label: `Label ${Object.keys(fields).length + 1}`, + data: data, + selectorObj: { + selector: currentSelector, + tag: highlighterData.elementInfo?.tagName, + shadow: highlighterData.elementInfo?.isShadowRoot, + attribute, + }, + }; + + const updatedFields = { + ...fields, + [newField.id]: newField, + }; + + setFields(updatedFields); + + if (listSelector) { + addListStep( + listSelector, + updatedFields, + currentListId, + currentListActionId || `list-${crypto.randomUUID()}`, + { type: "", selector: paginationSelector } + ); + } + } else { + setAttributeOptions(options); + setSelectedElement({ + selector: highlighterData.selector, + info: highlighterData.elementInfo, + }); + setShowAttributeModal(true); + } + return; + } if (getText === true) { + const options = getAttributeOptions( + highlighterData.elementInfo?.tagName || "", + highlighterData.elementInfo + ); + + if (options.length === 1) { + const attribute = options[0].value; + const data = + attribute === "href" + ? highlighterData.elementInfo?.url || "" + : attribute === "src" + ? highlighterData.elementInfo?.imageUrl || "" + : highlighterData.elementInfo?.innerText || ""; + + addTextStep( + "", + data, + { + selector: highlighterData.selector, + tag: highlighterData.elementInfo?.tagName, + shadow: highlighterData.elementInfo?.isShadowRoot, + attribute, + }, + currentTextActionId || `text-${crypto.randomUUID()}` + ); + } else { + setAttributeOptions(options); + setSelectedElement({ + selector: highlighterData.selector, + info: highlighterData.elementInfo, + }); + setShowAttributeModal(true); + } + } + }, + [ + getText, + getList, + listSelector, + paginationMode, + paginationType, + limitMode, + fields, + currentListId, + currentTextActionId, + currentListActionId, + addTextStep, + addListStep, + notify, + socket, + t, + paginationSelector, + ] + ); + + + const handleClick = (e: React.MouseEvent) => { + if (highlighterData) { + let shouldProcessClick = false; + + if (!isDOMMode && canvasRef?.current) { + const canvasRect = canvasRef.current.getBoundingClientRect(); + const clickX = e.clientX - canvasRect.left; + const clickY = e.clientY - canvasRect.top; + const highlightRect = highlighterData.rect; + const mappedRect = + coordinateMapper.mapBrowserRectToCanvas(highlightRect); + + shouldProcessClick = + clickX >= mappedRect.left && + clickX <= mappedRect.right && + clickY >= mappedRect.top && + clickY <= mappedRect.bottom; + } else { + shouldProcessClick = true; + } + + if (shouldProcessClick) { + const options = getAttributeOptions( + highlighterData.elementInfo?.tagName || "", + highlighterData.elementInfo + ); + + if (getText === true) { if (options.length === 1) { - const attribute = options[0].value; - const data = - attribute === "href" - ? highlighterData.elementInfo?.url || "" - : attribute === "src" - ? highlighterData.elementInfo?.imageUrl || "" - : highlighterData.elementInfo?.innerText || ""; - - addTextStep( - "", - data, - { - selector: highlighterData.selector, - tag: highlighterData.elementInfo?.tagName, - shadow: highlighterData.elementInfo?.isShadowRoot, - attribute, - }, - currentTextActionId || `text-${crypto.randomUUID()}` - ); + const attribute = options[0].value; + const data = + attribute === "href" + ? highlighterData.elementInfo?.url || "" + : attribute === "src" + ? highlighterData.elementInfo?.imageUrl || "" + : highlighterData.elementInfo?.innerText || ""; + + addTextStep( + "", + data, + { + selector: highlighterData.selector, + tag: highlighterData.elementInfo?.tagName, + shadow: highlighterData.elementInfo?.isShadowRoot, + attribute, + }, + currentTextActionId || `text-${crypto.randomUUID()}` + ); } else { - setAttributeOptions(options); - setSelectedElement({ - selector: highlighterData.selector, - info: highlighterData.elementInfo, - }); - setShowAttributeModal(true); + setAttributeOptions(options); + setSelectedElement({ + selector: highlighterData.selector, + info: highlighterData.elementInfo, + }); + setShowAttributeModal(true); } - } + } - if (paginationMode && getList) { + if (paginationMode && getList) { if ( - paginationType !== "" && - paginationType !== "scrollDown" && - paginationType !== "scrollUp" && - paginationType !== "none" + paginationType !== "" && + paginationType !== "scrollDown" && + paginationType !== "scrollUp" && + paginationType !== "none" ) { - setPaginationSelector(highlighterData.selector); - notify( - `info`, - t( - "browser_window.attribute_modal.notifications.pagination_select_success" - ) - ); - addListStep( - listSelector!, - fields, - currentListId || 0, - currentListActionId || `list-${crypto.randomUUID()}`, - { type: paginationType, selector: highlighterData.selector } - ); - socket?.emit("setPaginationMode", { pagination: false }); + setPaginationSelector(highlighterData.selector); + notify( + `info`, + t( + "browser_window.attribute_modal.notifications.pagination_select_success" + ) + ); + addListStep( + listSelector!, + fields, + currentListId || 0, + currentListActionId || `list-${crypto.randomUUID()}`, + { type: paginationType, selector: highlighterData.selector } + ); + socket?.emit("setPaginationMode", { pagination: false }); } return; - } + } - if (getList === true && !listSelector) { + if (getList === true && !listSelector) { let cleanedSelector = highlighterData.selector; - if (cleanedSelector.includes("nth-child")) { - cleanedSelector = cleanedSelector.replace(/:nth-child\(\d+\)/g, ""); + if ( + cleanedSelector.includes("[") && + cleanedSelector.match(/\[\d+\]/) + ) { + cleanedSelector = cleanedSelector.replace(/\[\d+\]/g, ""); } setListSelector(cleanedSelector); notify( - `info`, - t("browser_window.attribute_modal.notifications.list_select_success") + `info`, + t( + "browser_window.attribute_modal.notifications.list_select_success" + ) ); setCurrentListId(Date.now()); setFields({}); + } else if (getList === true && listSelector && currentListId) { + const attribute = options[0].value; + const data = + attribute === "href" + ? highlighterData.elementInfo?.url || "" + : attribute === "src" + ? highlighterData.elementInfo?.imageUrl || "" + : highlighterData.elementInfo?.innerText || ""; - socket?.emit("setGetList", { getList: true }); - socket?.emit("listSelector", { selector: cleanedSelector }); - } else if (getList === true && listSelector && currentListId) { if (options.length === 1) { - const attribute = options[0].value; - let currentSelector = highlighterData.selector; - - if (currentSelector.includes(">")) { - const [firstPart, ...restParts] = currentSelector - .split(">") - .map((p) => p.trim()); - const listSelectorRightPart = listSelector - .split(">") - .pop() - ?.trim() - .replace(/:nth-child\(\d+\)/g, ""); - - if ( - firstPart.includes("nth-child") && - firstPart.replace(/:nth-child\(\d+\)/g, "") === - listSelectorRightPart - ) { - currentSelector = `${firstPart.replace( - /:nth-child\(\d+\)/g, - "" - )} > ${restParts.join(" > ")}`; - } - } + let currentSelector = highlighterData.selector; + + if (currentSelector.includes("/")) { + const xpathParts = currentSelector + .split("/") + .filter((part) => part); + const cleanedParts = xpathParts.map((part) => { + return part.replace(/\[\d+\]/g, ""); + }); - const data = - attribute === "href" - ? highlighterData.elementInfo?.url || "" - : attribute === "src" - ? highlighterData.elementInfo?.imageUrl || "" - : highlighterData.elementInfo?.innerText || ""; - - const newField: TextStep = { - id: Date.now(), - type: "text", - label: `Label ${Object.keys(fields).length + 1}`, - data: data, - selectorObj: { - selector: currentSelector, - tag: highlighterData.elementInfo?.tagName, - shadow: highlighterData.elementInfo?.isShadowRoot, - attribute, - }, - }; - - const updatedFields = { - ...fields, - [newField.id]: newField, - }; - - setFields(updatedFields); - - if (listSelector) { - addListStep( - listSelector, - updatedFields, - currentListId, - currentListActionId || `list-${crypto.randomUUID()}`, - { type: "", selector: paginationSelector } - ); + if (cleanedParts.length > 0) { + currentSelector = "//" + cleanedParts.join("/"); } + } + + const newField: TextStep = { + id: Date.now(), + type: "text", + label: `Label ${Object.keys(fields).length + 1}`, + data: data, + selectorObj: { + selector: currentSelector, + tag: highlighterData.elementInfo?.tagName, + shadow: highlighterData.elementInfo?.isShadowRoot, + attribute, + }, + }; + + const updatedFields = { + ...fields, + [newField.id]: newField, + }; + + setFields(updatedFields); + + if (listSelector) { + addListStep( + listSelector, + updatedFields, + currentListId, + currentListActionId || `list-${crypto.randomUUID()}`, + { type: "", selector: paginationSelector } + ); + } } else { - setAttributeOptions(options); - setSelectedElement({ + setAttributeOptions(options); + setSelectedElement({ selector: highlighterData.selector, info: highlighterData.elementInfo, - }); - setShowAttributeModal(true); - } - } - }, - [ - getText, - getList, - listSelector, - paginationMode, - paginationType, - fields, - currentListId, - currentTextActionId, - currentListActionId, - addTextStep, - addListStep, - notify, - socket, - t, - paginationSelector, - ] - ); - - - const handleClick = (e: React.MouseEvent) => { - if (highlighterData && canvasRef?.current) { - const canvasRect = canvasRef.current.getBoundingClientRect(); - const clickX = e.clientX - canvasRect.left; - const clickY = e.clientY - canvasRect.top; - - const highlightRect = highlighterData.rect; - - const mappedRect = coordinateMapper.mapBrowserRectToCanvas(highlightRect); - if ( - clickX >= mappedRect.left && - clickX <= mappedRect.right && - clickY >= mappedRect.top && - clickY <= mappedRect.bottom - ) { - - const options = getAttributeOptions(highlighterData.elementInfo?.tagName || '', highlighterData.elementInfo); - - if (getText === true) { - if (options.length === 1) { - // Directly use the available attribute if only one option is present - const attribute = options[0].value; - const data = attribute === 'href' ? highlighterData.elementInfo?.url || '' : - attribute === 'src' ? highlighterData.elementInfo?.imageUrl || '' : - highlighterData.elementInfo?.innerText || ''; - - addTextStep('', data, { - selector: highlighterData.selector, - tag: highlighterData.elementInfo?.tagName, - shadow: highlighterData.elementInfo?.isShadowRoot, - attribute, - }, currentTextActionId || `text-${crypto.randomUUID()}`); - } else { - // Show the modal if there are multiple options - setAttributeOptions(options); - setSelectedElement({ - selector: highlighterData.selector, - info: highlighterData.elementInfo, - }); - setShowAttributeModal(true); - } - } - - if (paginationMode && getList) { - // Only allow selection in pagination mode if type is not empty, 'scrollDown', or 'scrollUp' - if (paginationType !== '' && paginationType !== 'scrollDown' && paginationType !== 'scrollUp' && paginationType !== 'none') { - setPaginationSelector(highlighterData.selector); - notify(`info`, t('browser_window.attribute_modal.notifications.pagination_select_success')); - addListStep(listSelector!, fields, currentListId || 0, currentListActionId || `list-${crypto.randomUUID()}`, { type: paginationType, selector: highlighterData.selector }); - socket?.emit('setPaginationMode', { pagination: false }); - } - return; - } - - if (getList === true && !listSelector) { - let cleanedSelector = highlighterData.selector; - if (cleanedSelector.includes('nth-child')) { - cleanedSelector = cleanedSelector.replace(/:nth-child\(\d+\)/g, ''); - } - - setListSelector(cleanedSelector); - notify(`info`, t('browser_window.attribute_modal.notifications.list_select_success')); - setCurrentListId(Date.now()); - setFields({}); - } else if (getList === true && listSelector && currentListId) { - const attribute = options[0].value; - const data = attribute === 'href' ? highlighterData.elementInfo?.url || '' : - attribute === 'src' ? highlighterData.elementInfo?.imageUrl || '' : - highlighterData.elementInfo?.innerText || ''; - // Add fields to the list - if (options.length === 1) { - const attribute = options[0].value; - let currentSelector = highlighterData.selector; - - if (currentSelector.includes('>')) { - const [firstPart, ...restParts] = currentSelector.split('>').map(p => p.trim()); - const listSelectorRightPart = listSelector.split('>').pop()?.trim().replace(/:nth-child\(\d+\)/g, ''); - - if (firstPart.includes('nth-child') && - firstPart.replace(/:nth-child\(\d+\)/g, '') === listSelectorRightPart) { - currentSelector = `${firstPart.replace(/:nth-child\(\d+\)/g, '')} > ${restParts.join(' > ')}`; - } - } - - const newField: TextStep = { - id: Date.now(), - type: 'text', - label: `Label ${Object.keys(fields).length + 1}`, - data: data, - selectorObj: { - selector: currentSelector, - tag: highlighterData.elementInfo?.tagName, - shadow: highlighterData.elementInfo?.isShadowRoot, - attribute - } - }; - - const updatedFields = { - ...fields, - [newField.id]: newField - }; - - setFields(updatedFields); - - if (listSelector) { - addListStep( - listSelector, - updatedFields, - currentListId, - currentListActionId || `list-${crypto.randomUUID()}`, - { type: '', selector: paginationSelector } - ); - } - - } else { - setAttributeOptions(options); - setSelectedElement({ - selector: highlighterData.selector, - info: highlighterData.elementInfo - }); - setShowAttributeModal(true); - } - } + }); + setShowAttributeModal(true); } + } } + } }; const handleAttributeSelection = (attribute: string) => { @@ -1149,31 +1207,88 @@ export const BrowserWindow = () => { )} {isDOMMode && highlighterData && ( - <> -
- - )} + <> + {/* Individual element highlight (for non-group or hovered element) */} + {(!getList || + listSelector || + !currentGroupInfo?.isGroupElement) && ( +
+ )} + + {/* Group elements highlighting with real-time coordinates */} + {getList && + !listSelector && + currentGroupInfo?.isGroupElement && + highlighterData.groupElements && + highlighterData.groupElements.map((groupElement, index) => ( + + {/* Highlight box */} +
+ +
+ List item {index + 1} +
+ + ))} + + )} )} @@ -1186,6 +1301,7 @@ export const BrowserWindow = () => { getList={getList} getText={getText} listSelector={listSelector} + cachedChildSelectors={cachedChildSelectors} paginationMode={paginationMode} paginationType={paginationType} limitMode={limitMode} diff --git a/src/components/recorder/DOMBrowserRenderer.tsx b/src/components/recorder/DOMBrowserRenderer.tsx index 03319ac30..60849c87f 100644 --- a/src/components/recorder/DOMBrowserRenderer.tsx +++ b/src/components/recorder/DOMBrowserRenderer.tsx @@ -98,6 +98,7 @@ interface RRWebDOMBrowserRendererProps { getList?: boolean; getText?: boolean; listSelector?: string | null; + cachedChildSelectors?: string[]; paginationMode?: boolean; paginationType?: string; limitMode?: boolean; @@ -106,12 +107,14 @@ interface RRWebDOMBrowserRendererProps { selector: string; elementInfo: ElementInfo | null; childSelectors?: string[]; + groupInfo?: any; }) => void; onElementSelect?: (data: { rect: DOMRect; selector: string; elementInfo: ElementInfo | null; childSelectors?: string[]; + groupInfo?: any; }) => void; onShowDatePicker?: (info: { coordinates: { x: number; y: number }; @@ -144,6 +147,7 @@ export const DOMBrowserRenderer: React.FC = ({ getList = false, getText = false, listSelector = null, + cachedChildSelectors = [], paginationMode = false, paginationType = "", limitMode = false, @@ -205,11 +209,24 @@ export const DOMBrowserRenderer: React.FC = ({ const handleDOMHighlighting = useCallback( (x: number, y: number, iframeDoc: Document) => { try { + if (!getText && !getList) { + setCurrentHighlight(null); + if (onHighlight) { + onHighlight({ + rect: new DOMRect(0, 0, 0, 0), + selector: "", + elementInfo: null, + }); + } + return; + } + const highlighterData = clientSelectorGenerator.generateDataForHighlighter( { x, y }, iframeDoc, - true + true, + cachedChildSelectors ); if (!highlighterData) { @@ -224,70 +241,40 @@ export const DOMBrowserRenderer: React.FC = ({ return; } - const { rect, selector, elementInfo, childSelectors } = highlighterData; + const { rect, selector, elementInfo, childSelectors, groupInfo } = + highlighterData; let shouldHighlight = false; if (getList) { - if (listSelector) { - const hasValidChildSelectors = - Array.isArray(childSelectors) && childSelectors.length > 0; - + // First phase: Allow any group to be highlighted for selection + if (!listSelector && groupInfo?.isGroupElement) { + shouldHighlight = true; + } + // Second phase: Show valid children within selected group + else if (listSelector) { if (limitMode) { shouldHighlight = false; - } else if (paginationMode) { - if ( - paginationType !== "" && - !["none", "scrollDown", "scrollUp"].includes(paginationType) - ) { - shouldHighlight = true; - } else { - shouldHighlight = false; - } - } else if (childSelectors && childSelectors.includes(selector)) { + } else if ( + paginationMode && + paginationType !== "" && + !["none", "scrollDown", "scrollUp"].includes(paginationType) + ) { + shouldHighlight = true; + } else if (childSelectors && childSelectors.length > 0) { + console.log("✅ Child selectors present, highlighting enabled"); shouldHighlight = true; - } else if (elementInfo?.isIframeContent && childSelectors) { - const isIframeChild = childSelectors.some( - (childSelector: string) => - selector.includes(":>>") && - childSelector - .split(":>>") - .some((part) => selector.includes(part.trim())) - ); - shouldHighlight = isIframeChild; - } else if (selector.includes(":>>") && hasValidChildSelectors) { - const selectorParts = selector - .split(":>>") - .map((part: string) => part.trim()); - const isValidMixedSelector = selectorParts.some((part: any) => - childSelectors!.some((childSelector) => - childSelector.includes(part) - ) - ); - } else if (elementInfo?.isShadowRoot && childSelectors) { - const isShadowChild = childSelectors.some( - (childSelector: string) => - selector.includes(">>") && - childSelector - .split(">>") - .some((part) => selector.includes(part.trim())) - ); - } else if (selector.includes(">>") && hasValidChildSelectors) { - const selectorParts = selector - .split(">>") - .map((part: string) => part.trim()); - const isValidMixedSelector = selectorParts.some((part: any) => - childSelectors!.some((childSelector) => - childSelector.includes(part) - ) - ); } else { + console.log("❌ No child selectors available"); shouldHighlight = false; } - } else { + } + // No list selector - show regular highlighting + else { shouldHighlight = true; } } else { + // getText mode - always highlight shouldHighlight = true; } @@ -316,6 +303,7 @@ export const DOMBrowserRenderer: React.FC = ({ }, selector, childSelectors, + groupInfo, }); } } @@ -335,9 +323,11 @@ export const DOMBrowserRenderer: React.FC = ({ } }, [ + getText, getList, listSelector, paginationMode, + cachedChildSelectors, paginationType, limitMode, onHighlight, @@ -363,6 +353,10 @@ export const DOMBrowserRenderer: React.FC = ({ return; } + if (!isInCaptureMode) { + return; + } + const now = performance.now(); if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) { return; @@ -401,11 +395,24 @@ export const DOMBrowserRenderer: React.FC = ({ e.stopPropagation(); if (currentHighlight && onElementSelect) { + // Get the group info for the current highlight + const highlighterData = + clientSelectorGenerator.generateDataForHighlighter( + { x: iframeX, y: iframeY }, + iframeDoc, + true, + cachedChildSelectors + ); + onElementSelect({ rect: currentHighlight.rect, selector: currentHighlight.selector, elementInfo: currentHighlight.elementInfo, - childSelectors: currentHighlight.childSelectors || [], + childSelectors: + cachedChildSelectors.length > 0 + ? cachedChildSelectors + : highlighterData?.childSelectors || [], + groupInfo: highlighterData?.groupInfo, }); } notifyLastAction("select element"); @@ -790,12 +797,41 @@ export const DOMBrowserRenderer: React.FC = ({ rebuiltHTML = "\n" + rebuiltHTML; + const additionalCSS = []; + + if (snapshotData.resources.fonts?.length > 0) { + const fontCSS = snapshotData.resources.fonts + .map((font) => { + const format = font.format || "woff2"; + return ` + @font-face { + font-family: 'ProxiedFont-${ + font.url.split("/").pop()?.split(".")[0] || "unknown" + }'; + src: url("${font.dataUrl}") format("${format}"); + font-display: swap; + } + `; + }) + .join("\n"); + additionalCSS.push(fontCSS); + } + + if (snapshotData.resources.stylesheets?.length > 0) { + const externalCSS = snapshotData.resources.stylesheets + .map((stylesheet) => stylesheet.content) + .join("\n\n"); + additionalCSS.push(externalCSS); + } + const enhancedCSS = ` /* rrweb rebuilt content styles */ html, body { - margin: 0 !important; - padding: 8px !important; - overflow-x: hidden !important; + margin: 0 !important; + padding: 8px !important; + font-family: system-ui, -apple-system, BlinkMacSystemFont, sans-serif !important; + background: white !important; + overflow-x: hidden !important; } html::-webkit-scrollbar, @@ -818,12 +854,22 @@ export const DOMBrowserRenderer: React.FC = ({ scrollbar-width: none !important; /* Firefox */ -ms-overflow-style: none !important; /* Internet Explorer 10+ */ } + + img { + max-width: 100% !important; + height: auto !important; + } + /* Make everything interactive */ * { cursor: "pointer" !important; } - `; + + /* Additional CSS from resources */ + ${additionalCSS.join("\n\n")} + `; + const headTagRegex = /]*>/i; const cssInjection = ` diff --git a/src/components/recorder/RightSidePanel.tsx b/src/components/recorder/RightSidePanel.tsx index 3fd3fcf6c..ba6c63461 100644 --- a/src/components/recorder/RightSidePanel.tsx +++ b/src/components/recorder/RightSidePanel.tsx @@ -22,6 +22,7 @@ import { useThemeMode } from '../../context/theme-provider'; import { useTranslation } from 'react-i18next'; import { useBrowserDimensionsStore } from '../../context/browserDimensions'; import { clientListExtractor } from '../../helpers/clientListExtractor'; +import { clientSelectorGenerator } from '../../helpers/clientSelectorGenerator'; const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => { getActiveWorkflow(id).then( @@ -52,10 +53,8 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false); const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false); const { panelHeight } = useBrowserDimensionsStore(); - const [isDOMMode, setIsDOMMode] = useState(false); - const [currentSnapshot, setCurrentSnapshot] = useState(null); - const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId } = useGlobalInfoStore(); + const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId, updateDOMMode, currentSnapshot, isDOMMode } = useGlobalInfoStore(); const { getText, startGetText, stopGetText, getList, startGetList, stopGetList, @@ -86,22 +85,20 @@ export const RightSidePanel: React.FC = ({ onFinishCapture if (socket) { const domModeHandler = (data: any) => { if (!data.userId || data.userId === id) { - setIsDOMMode(true); + updateDOMMode(true); } }; const screenshotModeHandler = (data: any) => { if (!data.userId || data.userId === id) { - setIsDOMMode(false); - setCurrentSnapshot(null); + updateDOMMode(false); } }; const domcastHandler = (data: any) => { if (!data.userId || data.userId === id) { if (data.snapshotData && data.snapshotData.snapshot) { - setCurrentSnapshot(data.snapshotData); - setIsDOMMode(true); + updateDOMMode(true, data.snapshotData); } } }; @@ -116,7 +113,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture socket.off("domcast", domcastHandler); }; } - }, [socket, id]); + }, [socket, id, updateDOMMode]); useEffect(() => { if (socket) { @@ -214,7 +211,6 @@ export const RightSidePanel: React.FC = ({ onFinishCapture ) => { if (isDOMMode && currentSnapshot) { try { - // Find the DOM iframe element let iframeElement = document.querySelector( "#dom-browser-iframe" ) as HTMLIFrameElement; @@ -247,22 +243,42 @@ export const RightSidePanel: React.FC = ({ onFinishCapture return; } - // Use client-side extraction + Object.entries(fields).forEach(([key, field]) => { + if (field.selectorObj?.selector) { + const isFieldXPath = + field.selectorObj.selector.startsWith("//") || + field.selectorObj.selector.startsWith("/"); + console.log( + `Field "${key}" selector:`, + field.selectorObj.selector, + `(XPath: ${isFieldXPath})` + ); + } + }); + const extractedData = clientListExtractor.extractListData( iframeDoc, listSelector, fields, - 5 // limit for preview + 5 ); updateListStepData(currentListId, extractedData); - console.log("✅ UI extraction completed:"); + + if (extractedData.length === 0) { + console.warn( + "⚠️ No data extracted - this might indicate selector issues" + ); + notify( + "warning", + "No data was extracted. Please verify your selections." + ); + } } catch (error) { console.error("Error in client-side data extraction:", error); notify("error", "Failed to extract data client-side"); } } else { - // Fallback to socket-based extraction for screenshot mode if (!socket) { console.error("Socket not available for backend extraction"); return; @@ -275,8 +291,6 @@ export const RightSidePanel: React.FC = ({ onFinishCapture currentListId, pagination: { type: "", selector: "" }, }); - - console.log("📤 Sent extraction request to server"); } catch (error) { console.error("Error in backend data extraction:", error); } @@ -443,6 +457,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture resetInterpretationLog(); finishAction('text'); onFinishCapture(); + clientSelectorGenerator.cleanup(); }, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]); const getListSettingsObject = useCallback(() => { @@ -494,6 +509,8 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const stopCaptureAndEmitGetListSettings = useCallback(() => { const settings = getListSettingsObject(); + + console.log("rrwebSnapshotHandler", settings); const latestListStep = getLatestListStep(browserSteps); if (latestListStep && settings) { @@ -509,6 +526,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture resetInterpretationLog(); finishAction('list'); onFinishCapture(); + clientSelectorGenerator.cleanup(); }, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]); const hasUnconfirmedListTextFields = browserSteps.some(step => @@ -638,6 +656,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture setCurrentTextActionId(''); setIsCaptureTextConfirmed(false); + clientSelectorGenerator.cleanup(); notify('error', t('right_panel.errors.capture_text_discarded')); }, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]); @@ -668,6 +687,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture setCaptureStage('initial'); setCurrentListActionId(''); setIsCaptureListConfirmed(false); + clientSelectorGenerator.cleanup(); notify('error', t('right_panel.errors.capture_list_discarded')); }, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]); @@ -686,6 +706,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture stopGetScreenshot(); resetInterpretationLog(); finishAction('screenshot'); + clientSelectorGenerator.cleanup(); onFinishCapture(); }; diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index 55f96b068..6f3cf8cd6 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,6 +27,41 @@ interface ScheduleConfig { cronExpression?: string; } +interface ProcessedSnapshot { + snapshot: any; + resources: { + stylesheets: Array<{ + href: string; + content: string; + media?: string; + }>; + images: Array<{ + src: string; + dataUrl: string; + alt?: string; + }>; + fonts: Array<{ + url: string; + dataUrl: string; + format?: string; + }>; + scripts: Array<{ + src: string; + content: string; + type?: string; + }>; + media: Array<{ + src: string; + dataUrl: string; + type: string; + }>; + }; + baseUrl: string; + viewport: { width: number; height: number }; + timestamp: number; + processingStats: any; +} + export interface RobotSettings { id: string; userId?: number; @@ -86,6 +121,11 @@ interface GlobalInfo { setCurrentListActionId: (actionId: string) => void; currentScreenshotActionId: string; setCurrentScreenshotActionId: (actionId: string) => void; + isDOMMode: boolean; + setIsDOMMode: (isDOMMode: boolean) => void; + currentSnapshot: ProcessedSnapshot | null; + setCurrentSnapshot: (snapshot: ProcessedSnapshot | null) => void; + updateDOMMode: (isDOMMode: boolean, snapshot?: ProcessedSnapshot | null) => void; }; class GlobalInfoStore implements Partial { @@ -115,6 +155,8 @@ class GlobalInfoStore implements Partial { currentTextActionId = ''; currentListActionId = ''; currentScreenshotActionId = ''; + isDOMMode = false; + currentSnapshot = null; }; const globalInfoStore = new GlobalInfoStore(); @@ -141,6 +183,8 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => { const [currentTextActionId, setCurrentTextActionId] = useState(''); const [currentListActionId, setCurrentListActionId] = useState(''); const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState(''); + const [isDOMMode, setIsDOMMode] = useState(globalInfoStore.isDOMMode); + const [currentSnapshot, setCurrentSnapshot] = useState(globalInfoStore.currentSnapshot); const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => { setNotification({ severity, message, isOpen: true }); @@ -165,6 +209,18 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => { }, 100); } + const updateDOMMode = (mode: boolean, snapshot?: ProcessedSnapshot | null) => { + setIsDOMMode(mode); + + if (snapshot !== undefined) { + setCurrentSnapshot(snapshot); + } + + if (!mode) { + setCurrentSnapshot(null); + } + } + return ( { setCurrentListActionId, currentScreenshotActionId, setCurrentScreenshotActionId, + isDOMMode, + setIsDOMMode, + currentSnapshot, + setCurrentSnapshot, + updateDOMMode, }} > {children} diff --git a/src/helpers/clientListExtractor.ts b/src/helpers/clientListExtractor.ts index c7b21fd32..790abdea2 100644 --- a/src/helpers/clientListExtractor.ts +++ b/src/helpers/clientListExtractor.ts @@ -15,30 +15,89 @@ interface ExtractedListData { [key: string]: string; } -interface TableField { +interface Field { selector: string; attribute: string; - tableContext?: string; - cellIndex?: number; } -interface NonTableField { - selector: string; - attribute: string; -} +class ClientListExtractor { + private evaluateXPath = ( + rootElement: Element | Document, + xpath: string + ): Element | null => { + try { + const ownerDoc = + rootElement.nodeType === Node.DOCUMENT_NODE + ? (rootElement as Document) + : rootElement.ownerDocument; + + if (!ownerDoc) return null; + + const result = ownerDoc.evaluate( + xpath, + rootElement, + null, + XPathResult.FIRST_ORDERED_NODE_TYPE, + null + ); -interface ContainerFields { - tableFields: Record; - nonTableFields: Record; -} + return result.singleNodeValue as Element | null; + } catch (error) { + console.warn("XPath evaluation failed:", xpath, error); + return null; + } + }; + + private evaluateXPathAll = ( + rootElement: Element | Document, + xpath: string + ): Element[] => { + try { + const ownerDoc = + rootElement.nodeType === Node.DOCUMENT_NODE + ? (rootElement as Document) + : rootElement.ownerDocument; + + if (!ownerDoc) return []; + + const result = ownerDoc.evaluate( + xpath, + rootElement, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); + + const elements: Element[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as Element); + } + } + + return elements; + } catch (error) { + console.warn("XPath evaluation failed:", xpath, error); + return []; + } + }; -class ClientListExtractor { private queryElement = ( rootElement: Element | Document, selector: string ): Element | null => { if (!selector.includes(">>") && !selector.includes(":>>")) { - return rootElement.querySelector(selector); + // Check if it's an XPath selector (starts with // or / or ./) + if ( + selector.startsWith("//") || + selector.startsWith("/") || + selector.startsWith("./") + ) { + return this.evaluateXPath(rootElement, selector); + } else { + return rootElement.querySelector(selector); + } } const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); @@ -59,7 +118,17 @@ class ClientListExtractor { frameElement.contentDocument || frameElement.contentWindow?.document; if (!frameDoc) return null; - currentElement = frameDoc.querySelector(parts[i]); + + // Handle XPath in iframe context + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + currentElement = this.evaluateXPath(frameDoc, parts[i]); + } else { + currentElement = frameDoc.querySelector(parts[i]); + } continue; } catch (e) { console.warn( @@ -75,7 +144,16 @@ class ClientListExtractor { let nextElement: Element | null = null; if ("querySelector" in currentElement) { - nextElement = currentElement.querySelector(parts[i]); + // Handle XPath vs CSS selector + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + nextElement = this.evaluateXPath(currentElement, parts[i]); + } else { + nextElement = currentElement.querySelector(parts[i]); + } } if ( @@ -83,9 +161,20 @@ class ClientListExtractor { "shadowRoot" in currentElement && (currentElement as Element).shadowRoot ) { - nextElement = (currentElement as Element).shadowRoot!.querySelector( - parts[i] - ); + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + nextElement = this.evaluateXPath( + (currentElement as Element).shadowRoot as unknown as Document, + parts[i] + ); + } else { + nextElement = (currentElement as Element).shadowRoot!.querySelector( + parts[i] + ); + } } if (!nextElement && "children" in currentElement) { @@ -94,7 +183,18 @@ class ClientListExtractor { ); for (const child of children) { if (child.shadowRoot) { - nextElement = child.shadowRoot.querySelector(parts[i]); + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + nextElement = this.evaluateXPath( + child.shadowRoot as unknown as Document, + parts[i] + ); + } else { + nextElement = child.shadowRoot.querySelector(parts[i]); + } if (nextElement) break; } } @@ -111,7 +211,12 @@ class ClientListExtractor { selector: string ): Element[] => { if (!selector.includes(">>") && !selector.includes(":>>")) { - return Array.from(rootElement.querySelectorAll(selector)); + // Check if it's an XPath selector (starts with // or /) + if (selector.startsWith("//") || selector.startsWith("/")) { + return this.evaluateXPathAll(rootElement, selector); + } else { + return Array.from(rootElement.querySelectorAll(selector)); + } } const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); @@ -133,7 +238,14 @@ class ClientListExtractor { frameElement.contentDocument || frameElement.contentWindow?.document; if (frameDoc) { - nextElements.push(...Array.from(frameDoc.querySelectorAll(part))); + // Handle XPath in iframe context + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push(...this.evaluateXPathAll(frameDoc, part)); + } else { + nextElements.push( + ...Array.from(frameDoc.querySelectorAll(part)) + ); + } } } catch (e) { console.warn( @@ -146,24 +258,47 @@ class ClientListExtractor { } } else { if ("querySelectorAll" in element) { - nextElements.push(...Array.from(element.querySelectorAll(part))); + // Handle XPath vs CSS selector + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push(...this.evaluateXPathAll(element, part)); + } else { + nextElements.push(...Array.from(element.querySelectorAll(part))); + } } if ("shadowRoot" in element && (element as Element).shadowRoot) { - nextElements.push( - ...Array.from( - (element as Element).shadowRoot!.querySelectorAll(part) - ) - ); + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push( + ...this.evaluateXPathAll( + (element as Element).shadowRoot as unknown as Document, + part + ) + ); + } else { + nextElements.push( + ...Array.from( + (element as Element).shadowRoot!.querySelectorAll(part) + ) + ); + } } if ("children" in element) { const children = Array.from((element as Element).children || []); for (const child of children) { if (child.shadowRoot) { - nextElements.push( - ...Array.from(child.shadowRoot.querySelectorAll(part)) - ); + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push( + ...this.evaluateXPathAll( + child.shadowRoot as unknown as Document, + part + ) + ); + } else { + nextElements.push( + ...Array.from(child.shadowRoot.querySelectorAll(part)) + ); + } } } } @@ -193,35 +328,66 @@ class ClientListExtractor { } if (attribute === "innerText") { - return (element as HTMLElement).innerText?.trim() || null; - } else if (attribute === "innerHTML") { - return element.innerHTML?.trim() || null; - } else if (attribute === "src" || attribute === "href") { - if (attribute === "href" && element.tagName !== "A") { - const parentElement = element.parentElement; - if (parentElement && parentElement.tagName === "A") { - const parentHref = parentElement.getAttribute("href"); - if (parentHref) { - try { - return new URL(parentHref, baseURL).href; - } catch (e) { - return parentHref; - } + // First try standard innerText/textContent + let textContent = + (element as HTMLElement).innerText?.trim() || + (element as HTMLElement).textContent?.trim(); + + // If empty, check for common data attributes that might contain the text + if (!textContent) { + // Check for data-* attributes that commonly contain text values + const dataAttributes = [ + "data-600", + "data-text", + "data-label", + "data-value", + "data-content", + ]; + for (const attr of dataAttributes) { + const dataValue = element.getAttribute(attr); + if (dataValue && dataValue.trim()) { + textContent = dataValue.trim(); + break; } } } + return textContent || null; + } else if (attribute === "innerHTML") { + return element.innerHTML?.trim() || null; + } else if (attribute === "href") { + // For href, we need to find the anchor tag if the current element isn't one + let anchorElement = element; + + // If current element is not an anchor, look for parent anchor + if (element.tagName !== "A") { + anchorElement = + element.closest("a") || + element.parentElement?.closest("a") || + element; + } + + const hrefValue = anchorElement.getAttribute("href"); + if (!hrefValue || hrefValue.trim() === "") { + return null; + } + + try { + return new URL(hrefValue, baseURL).href; + } catch (e) { + console.warn("Error creating URL from", hrefValue, e); + return hrefValue; + } + } else if (attribute === "src") { const attrValue = element.getAttribute(attribute); const dataAttr = attrValue || element.getAttribute("data-" + attribute); if (!dataAttr || dataAttr.trim() === "") { - if (attribute === "src") { - const style = window.getComputedStyle(element as HTMLElement); - const bgImage = style.backgroundImage; - if (bgImage && bgImage !== "none") { - const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); - return matches ? new URL(matches[1], baseURL).href : null; - } + const style = window.getComputedStyle(element as HTMLElement); + const bgImage = style.backgroundImage; + if (bgImage && bgImage !== "none") { + const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); + return matches ? new URL(matches[1], baseURL).href : null; } return null; } @@ -236,187 +402,8 @@ class ClientListExtractor { return element.getAttribute(attribute); }; - private findTableAncestor = ( - element: Element - ): { type: string; element: Element } | null => { - let currentElement: Element | null = element; - const MAX_DEPTH = 5; - let depth = 0; - - while (currentElement && depth < MAX_DEPTH) { - if (currentElement.getRootNode() instanceof ShadowRoot) { - currentElement = (currentElement.getRootNode() as ShadowRoot).host; - continue; - } - - if (currentElement.tagName === "TD") { - return { type: "TD", element: currentElement }; - } else if (currentElement.tagName === "TR") { - return { type: "TR", element: currentElement }; - } - - if ( - currentElement.tagName === "IFRAME" || - currentElement.tagName === "FRAME" - ) { - try { - const frameElement = currentElement as - | HTMLIFrameElement - | HTMLFrameElement; - currentElement = frameElement.contentDocument?.body || null; - } catch (e) { - return null; - } - } else { - currentElement = currentElement.parentElement; - } - depth++; - } - return null; - }; - - private getCellIndex = (td: Element): number => { - if (td.getRootNode() instanceof ShadowRoot) { - const shadowRoot = td.getRootNode() as ShadowRoot; - const allCells = Array.from(shadowRoot.querySelectorAll("td")); - return allCells.indexOf(td as HTMLTableCellElement); - } - - let index = 0; - let sibling = td; - while ((sibling = sibling.previousElementSibling as Element)) { - index++; - } - return index; - }; - - private hasThElement = ( - row: Element, - tableFields: Record - ): boolean => { - for (const [_, { selector }] of Object.entries(tableFields)) { - const element = this.queryElement(row, selector); - if (element) { - let current: Element | ShadowRoot | Document | null = element; - while (current && current !== row) { - if (current.getRootNode() instanceof ShadowRoot) { - current = (current.getRootNode() as ShadowRoot).host; - continue; - } - - if ((current as Element).tagName === "TH") return true; - - if ( - (current as Element).tagName === "IFRAME" || - (current as Element).tagName === "FRAME" - ) { - try { - const frameElement = current as - | HTMLIFrameElement - | HTMLFrameElement; - current = frameElement.contentDocument?.body || null; - } catch (e) { - break; - } - } else { - current = (current as Element).parentElement; - } - } - } - } - return false; - }; - - private filterRowsBasedOnTag = ( - rows: Element[], - tableFields: Record - ): Element[] => { - for (const row of rows) { - if (this.hasThElement(row, tableFields)) { - return rows; - } - } - return rows.filter((row) => { - const directTH = row.getElementsByTagName("TH").length === 0; - const shadowTH = row.shadowRoot - ? row.shadowRoot.querySelector("th") === null - : true; - return directTH && shadowTH; - }); - }; - - private calculateClassSimilarity = ( - classList1: string[], - classList2: string[] - ): number => { - const set1 = new Set(classList1); - const set2 = new Set(classList2); - const intersection = new Set([...set1].filter((x) => set2.has(x))); - const union = new Set([...set1, ...set2]); - return intersection.size / union.size; - }; - - private findSimilarElements = ( - baseElement: Element, - document: Document, - similarityThreshold: number = 0.7 - ): Element[] => { - const baseClasses = Array.from(baseElement.classList); - if (baseClasses.length === 0) return []; - - const allElements: Element[] = []; - - allElements.push( - ...Array.from(document.getElementsByTagName(baseElement.tagName)) - ); - - if (baseElement.getRootNode() instanceof ShadowRoot) { - const shadowHost = (baseElement.getRootNode() as ShadowRoot).host; - allElements.push( - ...Array.from(shadowHost.getElementsByTagName(baseElement.tagName)) - ); - } - - const frames = [ - ...Array.from(document.getElementsByTagName("iframe")), - ...Array.from(document.getElementsByTagName("frame")), - ]; - - for (const frame of frames) { - try { - const frameElement = frame as HTMLIFrameElement | HTMLFrameElement; - const frameDoc = - frameElement.contentDocument || frameElement.contentWindow?.document; - if (frameDoc) { - allElements.push( - ...Array.from(frameDoc.getElementsByTagName(baseElement.tagName)) - ); - } - } catch (e) { - console.warn( - `Cannot access ${frame.tagName.toLowerCase()} content:`, - e - ); - } - } - - return allElements.filter((element) => { - if (element === baseElement) return false; - const similarity = this.calculateClassSimilarity( - baseClasses, - Array.from(element.classList) - ); - return similarity >= similarityThreshold; - }); - }; - - private convertFields = ( - fields: any - ): Record => { - const convertedFields: Record< - string, - { selector: string; attribute: string } - > = {}; + private convertFields = (fields: any): Record => { + const convertedFields: Record = {}; for (const [key, field] of Object.entries(fields)) { const typedField = field as TextStep; @@ -439,283 +426,132 @@ class ClientListExtractor { // Convert fields to the format expected by the extraction logic const convertedFields = this.convertFields(fields); - // Get all container elements matching the list selector - let containers = this.queryElementAll(iframeDocument, listSelector); + // Step 1: Get all container elements matching the list selector + const containers = this.queryElementAll(iframeDocument, listSelector); if (containers.length === 0) { - console.warn("No containers found for listSelector:", listSelector); + console.warn("❌ No containers found for listSelector:", listSelector); return []; } - // Enhanced container discovery: find similar elements if we need more containers - if (limit > 1 && containers.length === 1) { - const baseContainer = containers[0]; - const similarContainers = this.findSimilarElements( - baseContainer, - iframeDocument, - 0.7 - ); - - if (similarContainers.length > 0) { - const newContainers = similarContainers.filter( - (container) => !container.matches(listSelector) - ); - containers = [...containers, ...newContainers]; - } - } - - // Analyze fields for table vs non-table context - const containerFields: ContainerFields[] = containers.map(() => ({ - tableFields: {}, - nonTableFields: {}, - })); - - containers.forEach((container, containerIndex) => { - for (const [label, field] of Object.entries(convertedFields)) { - const sampleElement = this.queryElement(container, field.selector); - - if (sampleElement) { - const ancestor = this.findTableAncestor(sampleElement); - if (ancestor) { - containerFields[containerIndex].tableFields[label] = { - ...field, - tableContext: ancestor.type, - cellIndex: - ancestor.type === "TD" - ? this.getCellIndex(ancestor.element) - : -1, - }; - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } - }); + // Step 2: Extract data from each container up to the limit + const extractedData: ExtractedListData[] = []; + const containersToProcess = Math.min(containers.length, limit); - // Extract table data - const tableData: ExtractedListData[] = []; for ( let containerIndex = 0; - containerIndex < containers.length; + containerIndex < containersToProcess; containerIndex++ ) { const container = containers[containerIndex]; - const { tableFields } = containerFields[containerIndex]; - - if (Object.keys(tableFields).length > 0) { - const firstField = Object.values(tableFields)[0]; - const firstElement = this.queryElement( - container, - firstField.selector - ); - let tableContext: Element | null = firstElement; + const record: ExtractedListData = {}; + + // Step 3: For each field, extract data from the current container + for (const [label, { selector, attribute }] of Object.entries( + convertedFields + )) { + let element: Element | null = null; + + // CORRECT APPROACH: Create indexed absolute XPath + if (selector.startsWith("//")) { + // Convert the absolute selector to target the specific container instance + const indexedSelector = this.createIndexedXPath( + selector, + listSelector, + containerIndex + 1 + ); - // Find the table context - while ( - tableContext && - tableContext.tagName !== "TABLE" && - tableContext !== container - ) { - if (tableContext.getRootNode() instanceof ShadowRoot) { - tableContext = (tableContext.getRootNode() as ShadowRoot).host; - continue; - } + element = this.evaluateXPathSingle(iframeDocument, indexedSelector); + } else { + // Fallback for non-XPath selectors + element = this.queryElement(container, selector); + } - if ( - tableContext.tagName === "IFRAME" || - tableContext.tagName === "FRAME" - ) { - try { - const frameElement = tableContext as - | HTMLIFrameElement - | HTMLFrameElement; - tableContext = frameElement.contentDocument?.body || null; - } catch (e) { - break; - } + // Step 4: Extract the value from the found element + if (element) { + const value = this.extractValue(element, attribute); + if (value !== null && value !== "") { + record[label] = value; } else { - tableContext = tableContext.parentElement; + console.warn(` ⚠️ Empty value for "${label}"`); + record[label] = ""; } + } else { + console.warn(` ❌ Element not found for "${label}"`); + record[label] = ""; } + } - if (tableContext) { - const rows: Element[] = []; - rows.push(...Array.from(tableContext.getElementsByTagName("TR"))); - - if ( - tableContext.tagName === "IFRAME" || - tableContext.tagName === "FRAME" - ) { - try { - const frameElement = tableContext as - | HTMLIFrameElement - | HTMLFrameElement; - const frameDoc = - frameElement.contentDocument || - frameElement.contentWindow?.document; - if (frameDoc) { - rows.push(...Array.from(frameDoc.getElementsByTagName("TR"))); - } - } catch (e) { - console.warn( - `Cannot access ${tableContext.tagName.toLowerCase()} rows:`, - e - ); - } - } - - const processedRows = this.filterRowsBasedOnTag(rows, tableFields); - - for ( - let rowIndex = 0; - rowIndex < Math.min(processedRows.length, limit); - rowIndex++ - ) { - const record: ExtractedListData = {}; - const currentRow = processedRows[rowIndex]; - - for (const [ - label, - { selector, attribute, cellIndex }, - ] of Object.entries(tableFields)) { - let element: Element | null = null; - - if (cellIndex !== undefined && cellIndex >= 0) { - let td: Element | null = - currentRow.children[cellIndex] || null; - - if (!td && currentRow.shadowRoot) { - const shadowCells = currentRow.shadowRoot.children; - if (shadowCells && shadowCells.length > cellIndex) { - td = shadowCells[cellIndex]; - } - } - - if (td) { - element = this.queryElement(td, selector); - - if ( - !element && - selector - .split(/(?:>>|:>>)/) - .pop() - ?.includes("td:nth-child") - ) { - element = td; - } - - if (!element) { - const tagOnlySelector = selector.split(".")[0]; - element = this.queryElement(td, tagOnlySelector); - } - - if (!element) { - let currentElement: Element | null = td; - while ( - currentElement && - currentElement.children.length > 0 - ) { - let foundContentChild = false; - for (const child of Array.from( - currentElement.children - )) { - if (this.extractValue(child, attribute)) { - currentElement = child; - foundContentChild = true; - break; - } - } - if (!foundContentChild) break; - } - element = currentElement; - } - } - } else { - element = this.queryElement(currentRow, selector); - } - - if (element) { - const value = this.extractValue(element, attribute); - if (value !== null && value !== "") { - record[label] = value; - } else { - console.warn( - `❌ No value for ${label} in row ${rowIndex + 1}` - ); - record[label] = ""; - } - } else { - console.warn( - `❌ Element not found for ${label} with selector:`, - selector - ); - record[label] = ""; - } - } - - if (Object.values(record).some((value) => value !== "")) { - tableData.push(record); - } - } - } + // Step 5: Add record if it has any non-empty values + if (Object.values(record).some((value) => value !== "")) { + extractedData.push(record); + } else { + console.warn( + ` ⚠️ Skipping empty record for container ${containerIndex + 1}` + ); } } - // Extract non-table data - const nonTableData: ExtractedListData[] = []; - for ( - let containerIndex = 0; - containerIndex < containers.length; - containerIndex++ - ) { - if (nonTableData.length >= limit) break; + return extractedData; + } catch (error) { + console.error("💥 Error in client-side extractListData:", error); + return []; + } + }; - const container = containers[containerIndex]; - const { nonTableFields } = containerFields[containerIndex]; + // Create indexed XPath for specific container instance + private createIndexedXPath( + childSelector: string, + listSelector: string, + containerIndex: number + ): string { + // Check if the child selector contains the list selector pattern + if (childSelector.includes(listSelector.replace("//", ""))) { + // Replace the list selector part with indexed version + const listPattern = listSelector.replace("//", ""); + const indexedListSelector = `(${listSelector})[${containerIndex}]`; + + const indexedSelector = childSelector.replace( + `//${listPattern}`, + indexedListSelector + ); - if (Object.keys(nonTableFields).length > 0) { - const record: ExtractedListData = {}; + return indexedSelector; + } else { + // If pattern doesn't match, create a more generic indexed selector + // This is a fallback approach + console.warn(` ⚠️ Pattern doesn't match, using fallback approach`); + return `(${listSelector})[${containerIndex}]${childSelector.replace( + "//", + "/" + )}`; + } + } - for (const [label, { selector, attribute }] of Object.entries( - nonTableFields - )) { - const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0]; - const element = this.queryElement(container, relativeSelector); + // Helper method for single XPath evaluation + private evaluateXPathSingle = ( + document: Document, + xpath: string + ): Element | null => { + try { + const result = document.evaluate( + xpath, + document, + null, + XPathResult.FIRST_ORDERED_NODE_TYPE, + null + ); - if (element) { - const value = this.extractValue(element, attribute); - if (value !== null && value !== "") { - record[label] = value; - } else { - console.warn( - `❌ No value for ${label} in container ${containerIndex + 1}` - ); - record[label] = ""; - } - } else { - console.warn( - `❌ Element not found for ${label} with selector:`, - selector - ); - record[label] = ""; - } - } + const element = result.singleNodeValue as Element | null; - if (Object.values(record).some((value) => value !== "")) { - nonTableData.push(record); - } - } + if (!element) { + console.warn(`❌ XPath found no element for: ${xpath}`); } - // Combine and limit results - const extractedData = [...tableData, ...nonTableData].slice(0, limit); - - return extractedData; + return element; } catch (error) { - console.error("Error in client-side extractListData:", error); - return []; + console.error("❌ XPath evaluation failed:", xpath, error); + return null; } }; } diff --git a/src/helpers/clientSelectorGenerator.ts b/src/helpers/clientSelectorGenerator.ts index 28134dbb6..92dda20e3 100644 --- a/src/helpers/clientSelectorGenerator.ts +++ b/src/helpers/clientSelectorGenerator.ts @@ -1,5 +1,3 @@ -// utils/clientSelectorGenerator.ts - interface Coordinates { x: number; y: number; @@ -87,11 +85,45 @@ interface Action { hasOnlyText: boolean; } +export interface ElementFingerprint { + tagName: string; + normalizedClasses: string; + childrenCount: number; + childrenStructure: string; + attributes: string; + depth: number; + textCharacteristics: { + hasText: boolean; + textLength: number; + hasLinks: number; + hasImages: number; + hasButtons: number; + }; + signature: string; +} + +interface ElementGroup { + elements: HTMLElement[]; + fingerprint: ElementFingerprint; + representative: HTMLElement; +} + class ClientSelectorGenerator { private listSelector: string = ""; private getList: boolean = false; private paginationMode: boolean = false; + private elementGroups: Map = new Map(); + private groupedElements: Set = new Set(); + private lastAnalyzedDocument: Document | null = null; + private groupingConfig = { + minGroupSize: 2, + similarityThreshold: 0.7, + minWidth: 50, + minHeight: 20, + excludeSelectors: ["script", "style", "meta", "link", "title", "head"], + }; + // Add setter methods for state management public setListSelector(selector: string): void { this.listSelector = selector; @@ -117,6 +149,386 @@ class ClientSelectorGenerator { }; } + /** + * Normalize class names by removing dynamic/unique parts + */ + private normalizeClasses(classList: DOMTokenList): string { + return Array.from(classList) + .filter((cls) => { + // Filter out classes that look like they contain IDs or dynamic content + return !cls.match(/\d{3,}|uuid|hash|id-|_\d+$/i); + }) + .sort() + .join(" "); + } + + /** + * Get element's structural fingerprint for grouping + */ + private getStructuralFingerprint( + element: HTMLElement + ): ElementFingerprint | null { + if (element.nodeType !== Node.ELEMENT_NODE) return null; + + const tagName = element.tagName.toLowerCase(); + if (this.groupingConfig.excludeSelectors.includes(tagName)) return null; + + const children = Array.from(element.children); + const childrenStructure = children.map((child) => ({ + tag: child.tagName.toLowerCase(), + classes: this.normalizeClasses(child.classList), + hasText: (child.textContent ?? "").trim().length > 0, + })); + + const normalizedClasses = this.normalizeClasses(element.classList); + + // Get attributes (excluding unique identifiers) + const relevantAttributes = Array.from(element.attributes) + .filter( + (attr) => + !["id", "style", "data-reactid", "data-react-checksum"].includes( + attr.name.toLowerCase() + ) + ) + .filter( + (attr) => + !attr.name.startsWith("data-") || + attr.name === "data-type" || + attr.name === "data-role" + ) + .map((attr) => `${attr.name}=${attr.value}`) + .sort(); + + // Calculate element depth + let depth = 0; + let parent = element.parentElement; + while (parent && depth < 20) { + depth++; + parent = parent.parentElement; + } + + // Get text content characteristics + const textContent = (element.textContent ?? "").trim(); + const textCharacteristics = { + hasText: textContent.length > 0, + textLength: Math.floor(textContent.length / 20) * 20, + hasLinks: element.querySelectorAll("a").length, + hasImages: element.querySelectorAll("img").length, + hasButtons: element.querySelectorAll( + 'button, input[type="button"], input[type="submit"]' + ).length, + }; + + const signature = `${tagName}::${normalizedClasses}::${ + children.length + }::${JSON.stringify(childrenStructure)}::${relevantAttributes.join("|")}`; + + return { + tagName, + normalizedClasses, + childrenCount: children.length, + childrenStructure: JSON.stringify(childrenStructure), + attributes: relevantAttributes.join("|"), + depth, + textCharacteristics, + signature, + }; + } + + /** + * Calculate similarity between two fingerprints + */ + private calculateSimilarity( + fp1: ElementFingerprint, + fp2: ElementFingerprint + ): number { + if (!fp1 || !fp2) return 0; + + let score = 0; + let maxScore = 0; + + // Tag name must match + maxScore += 10; + if (fp1.tagName === fp2.tagName) score += 10; + else return 0; + + // Class similarity + maxScore += 8; + if (fp1.normalizedClasses === fp2.normalizedClasses) score += 8; + else if (fp1.normalizedClasses && fp2.normalizedClasses) { + const classes1 = fp1.normalizedClasses.split(" ").filter((c) => c); + const classes2 = fp2.normalizedClasses.split(" ").filter((c) => c); + const commonClasses = classes1.filter((c) => classes2.includes(c)); + if (classes1.length > 0 && classes2.length > 0) { + score += + (commonClasses.length / Math.max(classes1.length, classes2.length)) * + 8; + } + } + + // Children structure + maxScore += 8; + if (fp1.childrenStructure === fp2.childrenStructure) score += 8; + else if (fp1.childrenCount === fp2.childrenCount) score += 4; + + // Attributes similarity + maxScore += 5; + if (fp1.attributes === fp2.attributes) score += 5; + else if (fp1.attributes && fp2.attributes) { + const attrs1 = fp1.attributes.split("|").filter((a) => a); + const attrs2 = fp2.attributes.split("|").filter((a) => a); + const commonAttrs = attrs1.filter((a) => attrs2.includes(a)); + if (attrs1.length > 0 && attrs2.length > 0) { + score += + (commonAttrs.length / Math.max(attrs1.length, attrs2.length)) * 5; + } + } + + // Depth similarity + maxScore += 2; + if (Math.abs(fp1.depth - fp2.depth) <= 1) score += 2; + else if (Math.abs(fp1.depth - fp2.depth) <= 2) score += 1; + + // Text characteristics similarity + maxScore += 3; + const tc1 = fp1.textCharacteristics; + const tc2 = fp2.textCharacteristics; + if (tc1.hasText === tc2.hasText) score += 1; + if (Math.abs(tc1.textLength - tc2.textLength) <= 40) score += 1; + if (tc1.hasLinks === tc2.hasLinks && tc1.hasImages === tc2.hasImages) + score += 1; + + return maxScore > 0 ? score / maxScore : 0; + } + + public analyzeElementGroups(iframeDoc: Document): void { + // Only re-analyze if document changed + if ( + this.lastAnalyzedDocument === iframeDoc && + this.elementGroups.size > 0 + ) { + return; + } + + // Clear previous analysis + this.elementGroups.clear(); + this.groupedElements.clear(); + this.lastAnalyzedDocument = iframeDoc; + + // Get all visible elements + const allElements = Array.from(iframeDoc.querySelectorAll("*")).filter( + (el) => { + const rect = el.getBoundingClientRect(); + return rect.width > 0 && rect.height > 0; // Only visible elements + } + ) as HTMLElement[]; + + // Create fingerprints for all elements + const elementFingerprints = new Map(); + + allElements.forEach((element) => { + const fingerprint = this.getStructuralFingerprint(element); + if (fingerprint) { + elementFingerprints.set(element, fingerprint); + } + }); + + // Find similar groups using similarity scoring + const similarGroups: ElementGroup[] = []; + const processedElements = new Set(); + + elementFingerprints.forEach((fingerprint, element) => { + if (processedElements.has(element)) return; + + const currentGroup = [element]; + processedElements.add(element); + + // Find similar elements + elementFingerprints.forEach((otherFingerprint, otherElement) => { + if (processedElements.has(otherElement)) return; + + const similarity = this.calculateSimilarity( + fingerprint, + otherFingerprint + ); + if (similarity >= this.groupingConfig.similarityThreshold) { + currentGroup.push(otherElement); + processedElements.add(otherElement); + } + }); + + // Add group if it has enough members AND has meaningful children + if (currentGroup.length >= this.groupingConfig.minGroupSize) { + // Check if the representative element has meaningful children + const hasChildren = this.hasAnyMeaningfulChildren(element); + + if (hasChildren) { + const group: ElementGroup = { + elements: currentGroup, + fingerprint, + representative: element, + }; + similarGroups.push(group); + + // Map each element to its group + currentGroup.forEach((el) => { + this.elementGroups.set(el, group); + this.groupedElements.add(el); + }); + } + } + }); + + // Sort groups by size and relevance + similarGroups.sort((a, b) => { + // Prioritize by size first + if (b.elements.length !== a.elements.length) + return b.elements.length - a.elements.length; + + // Then by element size + const aSize = + a.representative.getBoundingClientRect().width * + a.representative.getBoundingClientRect().height; + const bSize = + b.representative.getBoundingClientRect().width * + b.representative.getBoundingClientRect().height; + return bSize - aSize; + }); + } + + /** + * Check if element has any meaningful children that can be extracted + */ + private hasAnyMeaningfulChildren(element: HTMLElement): boolean { + const meaningfulChildren = this.getMeaningfulChildren(element); + return meaningfulChildren.length > 0; + } + + /** + * Get meaningful children (those with text, links, images, etc.) + */ + private getMeaningfulChildren(element: HTMLElement): HTMLElement[] { + const meaningfulChildren: HTMLElement[] = []; + + const traverse = (el: HTMLElement) => { + Array.from(el.children).forEach((child) => { + const htmlChild = child as HTMLElement; + + // Check if this child has meaningful content + if (this.isMeaningfulElement(htmlChild)) { + meaningfulChildren.push(htmlChild); + } else { + // If not meaningful itself, check its children + traverse(htmlChild); + } + }); + }; + + traverse(element); + return meaningfulChildren; + } + + /** + * Check if element has meaningful content for extraction + */ + private isMeaningfulElement(element: HTMLElement): boolean { + const tagName = element.tagName.toLowerCase(); + const text = (element.textContent || "").trim(); + const hasHref = element.hasAttribute("href"); + const hasSrc = element.hasAttribute("src"); + + // Meaningful if it has text content, is a link, image, or input + return ( + text.length > 0 || + hasHref || + hasSrc || + ["a", "img", "input", "button", "select"].includes(tagName) + ); + } + + /** + * Check if an element is part of a group (for highlighting) + */ + public isElementGrouped(element: HTMLElement): boolean { + return this.groupedElements.has(element); + } + + /** + * Get the group for a specific element + */ + public getElementGroup(element: HTMLElement): ElementGroup | null { + return this.elementGroups.get(element) || null; + } + + /** + * Modified container finding that only returns grouped elements + */ + private findGroupedContainerAtPoint( + x: number, + y: number, + iframeDoc: Document + ): HTMLElement | null { + // Ensure groups are analyzed + this.analyzeElementGroups(iframeDoc); + + // Get all elements at the point + const elementsAtPoint = iframeDoc.elementsFromPoint(x, y) as HTMLElement[]; + if (!elementsAtPoint.length) return null; + + // In list mode without selector, transform table cells to rows and prioritize grouped elements + if (this.getList === true && this.listSelector === "") { + const transformedElements: HTMLElement[] = []; + + elementsAtPoint.forEach((element) => { + if (element.tagName === "TD" || element.tagName === "TH") { + // Find parent TR for table cells + const parentRow = element.closest("tr") as HTMLElement; + if (parentRow && !transformedElements.includes(parentRow)) { + transformedElements.push(parentRow); + } + } else { + // Keep non-table-cell elements as is + if (!transformedElements.includes(element)) { + transformedElements.push(element); + } + } + }); + + // Now filter for grouped elements from the transformed list + const groupedElementsAtPoint = transformedElements.filter((element) => + this.isElementGrouped(element) + ); + + if (groupedElementsAtPoint.length > 0) { + // Sort by DOM depth (deeper elements first for more specificity) + groupedElementsAtPoint.sort((a, b) => { + const aDepth = this.getElementDepth(a); + const bDepth = this.getElementDepth(b); + return bDepth - aDepth; + }); + + const selectedElement = groupedElementsAtPoint[0]; + return selectedElement; + } + + return null; + } + + // For other modes or when list selector exists, return regular element + return this.getDeepestElementFromPoint(elementsAtPoint); + } + + private getElementDepth(element: HTMLElement): number { + let depth = 0; + let current = element; + while (current && current !== this.lastAnalyzedDocument?.body) { + depth++; + current = current.parentElement as HTMLElement; + if (depth > 50) break; + } + return depth; + } + public getElementInformation = ( iframeDoc: Document, coordinates: Coordinates, @@ -435,6 +847,153 @@ class ClientSelectorGenerator { } return null; } else { + const originalEl = this.findGroupedContainerAtPoint( + coordinates.x, + coordinates.y, + iframeDoc + ); + + if (originalEl) { + let element = originalEl; + + if (element.tagName === "TD" || element.tagName === "TH") { + const tableParent = element.closest("table"); + if (tableParent) { + element = tableParent; + } + } + + const ownerDocument = element.ownerDocument; + const frameElement = ownerDocument?.defaultView?.frameElement; + const isIframeContent = Boolean(frameElement); + const isFrameContent = frameElement?.tagName === "FRAME"; + + const containingShadowRoot = element.getRootNode() as ShadowRoot; + const isShadowRoot = containingShadowRoot instanceof ShadowRoot; + + let info: { + tagName: string; + hasOnlyText?: boolean; + innerText?: string; + url?: string; + imageUrl?: string; + attributes?: Record; + innerHTML?: string; + outerHTML?: string; + isIframeContent?: boolean; + isFrameContent?: boolean; + iframeURL?: string; + frameURL?: string; + iframeIndex?: number; + frameIndex?: number; + frameHierarchy?: string[]; + isShadowRoot?: boolean; + shadowRootMode?: string; + shadowRootContent?: string; + } = { + tagName: element?.tagName ?? "", + isIframeContent, + isFrameContent, + isShadowRoot, + }; + + if (isIframeContent || isFrameContent) { + if (isIframeContent && !isFrameContent) { + info.iframeURL = (frameElement as HTMLIFrameElement).src; + } else if (isFrameContent) { + info.frameURL = (frameElement as HTMLFrameElement).src; + } + + let currentFrame = frameElement; + const frameHierarchy: string[] = []; + let frameIndex = 0; + + while (currentFrame) { + frameHierarchy.unshift( + currentFrame.id || + currentFrame.getAttribute("name") || + (currentFrame as HTMLFrameElement).src || + `${currentFrame.tagName.toLowerCase()}[${frameIndex}]` + ); + + const parentDoc = currentFrame.ownerDocument; + currentFrame = parentDoc?.defaultView?.frameElement; + frameIndex++; + } + + info.frameHierarchy = frameHierarchy; + if (isIframeContent && !isFrameContent) { + info.iframeIndex = frameIndex - 1; + } else if (isFrameContent) { + info.frameIndex = frameIndex - 1; + } + } + + if (isShadowRoot) { + info.shadowRootMode = containingShadowRoot.mode; + info.shadowRootContent = containingShadowRoot.innerHTML; + } + + if (element) { + info.attributes = Array.from(element.attributes).reduce( + (acc, attr) => { + acc[attr.name] = attr.value; + return acc; + }, + {} as Record + ); + + if (element.tagName === "A") { + info.url = (element as HTMLAnchorElement).href; + info.innerText = element.textContent ?? ""; + } else if (element.tagName === "IMG") { + info.imageUrl = (element as HTMLImageElement).src; + } else if (element?.tagName === "SELECT") { + const selectElement = element as HTMLSelectElement; + info.innerText = + selectElement.options[selectElement.selectedIndex]?.text ?? ""; + info.attributes = { + ...info.attributes, + selectedValue: selectElement.value, + }; + } else if ( + element?.tagName === "INPUT" && + ((element as HTMLInputElement).type === "time" || + (element as HTMLInputElement).type === "date") + ) { + info.innerText = (element as HTMLInputElement).value; + } else { + info.hasOnlyText = + element.children.length === 0 && + element.textContent !== null && + element.textContent.trim().length > 0; + info.innerText = element.textContent ?? ""; + } + + info.innerHTML = element.innerHTML; + info.outerHTML = element.outerHTML; + } + + return info; + } + return null; + } + } catch (error) { + const { message, stack } = error as Error; + console.error("Error while retrieving selector:", message); + console.error("Stack:", stack); + } + }; + + private getRect = ( + iframeDoc: Document, + coordinates: Coordinates, + listSelector: string, + getList: boolean, + isDOMMode: boolean = false + ) => { + try { + if (!getList || listSelector !== "") { const getDeepestElementFromPoint = ( x: number, y: number @@ -442,654 +1001,40 @@ class ClientSelectorGenerator { let elements = iframeDoc.elementsFromPoint(x, y) as HTMLElement[]; if (!elements.length) return null; - const findContainerElement = ( + const findDeepestElement = ( elements: HTMLElement[] ): HTMLElement | null => { if (!elements.length) return null; if (elements.length === 1) return elements[0]; - for (let i = 0; i < elements.length; i++) { - const element = elements[i]; - const rect = element.getBoundingClientRect(); + let deepestElement = elements[0]; + let maxDepth = 0; - if (rect.width >= 30 && rect.height >= 30) { - const hasChildrenInList = elements.some( - (otherElement, j) => i !== j && element.contains(otherElement) - ); + for (const element of elements) { + let depth = 0; + let current = element; - if (hasChildrenInList) { - return element; + while (current) { + depth++; + if (current.parentElement) { + current = current.parentElement; + } else { + break; } } + + if (depth > maxDepth) { + maxDepth = depth; + deepestElement = element; + } } - return elements[0]; + return deepestElement; }; - let deepestElement = findContainerElement(elements); + let deepestElement = findDeepestElement(elements); if (!deepestElement) return null; - if (deepestElement.tagName === "A") { - for (let i = 1; i < elements.length; i++) { - const sibling = elements[i]; - if ( - !deepestElement.contains(sibling) && - !sibling.contains(deepestElement) - ) { - const anchorRect = deepestElement.getBoundingClientRect(); - const siblingRect = sibling.getBoundingClientRect(); - - const isOverlapping = !( - siblingRect.right < anchorRect.left || - siblingRect.left > anchorRect.right || - siblingRect.bottom < anchorRect.top || - siblingRect.top > anchorRect.bottom - ); - - if (isOverlapping) { - deepestElement = sibling; - break; - } - } - } - } - - const traverseShadowDOM = (element: HTMLElement): HTMLElement => { - let current = element; - let shadowRoot = current.shadowRoot; - let deepest = current; - let depth = 0; - const MAX_SHADOW_DEPTH = 4; - - while (shadowRoot && depth < MAX_SHADOW_DEPTH) { - const shadowElement = shadowRoot.elementFromPoint( - x, - y - ) as HTMLElement; - if (!shadowElement || shadowElement === current) break; - - deepest = shadowElement; - current = shadowElement; - shadowRoot = current.shadowRoot; - depth++; - } - - return deepest; - }; - - const isInFrameset = () => { - let node = deepestElement; - while (node && node.parentElement) { - if (node.tagName === "FRAMESET" || node.tagName === "FRAME") { - return true; - } - node = node.parentElement; - } - return false; - }; - - if (deepestElement.tagName === "IFRAME") { - let currentIframe = deepestElement as HTMLIFrameElement; - let depth = 0; - const MAX_IFRAME_DEPTH = 4; - - while (currentIframe && depth < MAX_IFRAME_DEPTH) { - try { - const iframeRect = currentIframe.getBoundingClientRect(); - const iframeX = x - iframeRect.left; - const iframeY = y - iframeRect.top; - - const iframeDocument = - currentIframe.contentDocument || - currentIframe.contentWindow?.document; - if (!iframeDocument) break; - - const iframeElement = iframeDocument.elementFromPoint( - iframeX, - iframeY - ) as HTMLElement; - if (!iframeElement) break; - - deepestElement = traverseShadowDOM(iframeElement); - - if (iframeElement.tagName === "IFRAME") { - currentIframe = iframeElement as HTMLIFrameElement; - depth++; - } else { - break; - } - } catch (error) { - console.warn("Cannot access iframe content:", error); - break; - } - } - } else if (deepestElement.tagName === "FRAME" || isInFrameset()) { - const framesToCheck = []; - - if (deepestElement.tagName === "FRAME") { - framesToCheck.push(deepestElement as HTMLFrameElement); - } - - if (isInFrameset()) { - iframeDoc.querySelectorAll("frame").forEach((frame) => { - framesToCheck.push(frame as HTMLFrameElement); - }); - } - - let frameDepth = 0; - const MAX_FRAME_DEPTH = 4; - - const processFrames = ( - frames: HTMLFrameElement[], - currentDepth: number - ) => { - if (currentDepth >= MAX_FRAME_DEPTH) return; - - for (const frameElement of frames) { - try { - const frameRect = frameElement.getBoundingClientRect(); - const frameX = x - frameRect.left; - const frameY = y - frameRect.top; - - if ( - frameX < 0 || - frameY < 0 || - frameX > frameRect.width || - frameY > frameRect.height - ) { - continue; - } - - const frameDocument = - frameElement.contentDocument || - frameElement.contentWindow?.document; - - if (!frameDocument) continue; - - const frameElementAtPoint = frameDocument.elementFromPoint( - frameX, - frameY - ) as HTMLElement; - if (!frameElementAtPoint) continue; - - deepestElement = traverseShadowDOM(frameElementAtPoint); - - if (frameElementAtPoint.tagName === "FRAME") { - processFrames( - [frameElementAtPoint as HTMLFrameElement], - currentDepth + 1 - ); - } - - break; - } catch (error) { - console.warn("Cannot access frame content:", error); - continue; - } - } - }; - - processFrames(framesToCheck, frameDepth); - } else { - deepestElement = traverseShadowDOM(deepestElement); - } - - return deepestElement; - }; - - const originalEl = getDeepestElementFromPoint( - coordinates.x, - coordinates.y - ); - if (originalEl) { - let element = originalEl; - - if (element.tagName === "TD" || element.tagName === "TH") { - const tableParent = element.closest("table"); - if (tableParent) { - element = tableParent; - } - } - - const ownerDocument = element.ownerDocument; - const frameElement = ownerDocument?.defaultView?.frameElement; - const isIframeContent = Boolean(frameElement); - const isFrameContent = frameElement?.tagName === "FRAME"; - - const containingShadowRoot = element.getRootNode() as ShadowRoot; - const isShadowRoot = containingShadowRoot instanceof ShadowRoot; - - let info: { - tagName: string; - hasOnlyText?: boolean; - innerText?: string; - url?: string; - imageUrl?: string; - attributes?: Record; - innerHTML?: string; - outerHTML?: string; - isIframeContent?: boolean; - isFrameContent?: boolean; - iframeURL?: string; - frameURL?: string; - iframeIndex?: number; - frameIndex?: number; - frameHierarchy?: string[]; - isShadowRoot?: boolean; - shadowRootMode?: string; - shadowRootContent?: string; - } = { - tagName: element?.tagName ?? "", - isIframeContent, - isFrameContent, - isShadowRoot, - }; - - if (isIframeContent || isFrameContent) { - if (isIframeContent && !isFrameContent) { - info.iframeURL = (frameElement as HTMLIFrameElement).src; - } else if (isFrameContent) { - info.frameURL = (frameElement as HTMLFrameElement).src; - } - - let currentFrame = frameElement; - const frameHierarchy: string[] = []; - let frameIndex = 0; - - while (currentFrame) { - frameHierarchy.unshift( - currentFrame.id || - currentFrame.getAttribute("name") || - (currentFrame as HTMLFrameElement).src || - `${currentFrame.tagName.toLowerCase()}[${frameIndex}]` - ); - - const parentDoc = currentFrame.ownerDocument; - currentFrame = parentDoc?.defaultView?.frameElement; - frameIndex++; - } - - info.frameHierarchy = frameHierarchy; - if (isIframeContent && !isFrameContent) { - info.iframeIndex = frameIndex - 1; - } else if (isFrameContent) { - info.frameIndex = frameIndex - 1; - } - } - - if (isShadowRoot) { - info.shadowRootMode = containingShadowRoot.mode; - info.shadowRootContent = containingShadowRoot.innerHTML; - } - - if (element) { - info.attributes = Array.from(element.attributes).reduce( - (acc, attr) => { - acc[attr.name] = attr.value; - return acc; - }, - {} as Record - ); - - if (element.tagName === "A") { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.textContent ?? ""; - } else if (element.tagName === "IMG") { - info.imageUrl = (element as HTMLImageElement).src; - } else if (element?.tagName === "SELECT") { - const selectElement = element as HTMLSelectElement; - info.innerText = - selectElement.options[selectElement.selectedIndex]?.text ?? ""; - info.attributes = { - ...info.attributes, - selectedValue: selectElement.value, - }; - } else if ( - element?.tagName === "INPUT" && - ((element as HTMLInputElement).type === "time" || - (element as HTMLInputElement).type === "date") - ) { - info.innerText = (element as HTMLInputElement).value; - } else { - info.hasOnlyText = - element.children.length === 0 && - element.textContent !== null && - element.textContent.trim().length > 0; - info.innerText = element.textContent ?? ""; - } - - info.innerHTML = element.innerHTML; - info.outerHTML = element.outerHTML; - } - - return info; - } - return null; - } - } catch (error) { - const { message, stack } = error as Error; - console.error("Error while retrieving selector:", message); - console.error("Stack:", stack); - } - }; - - private getRect = ( - iframeDoc: Document, - coordinates: Coordinates, - listSelector: string, - getList: boolean, - isDOMMode: boolean = false - ) => { - try { - if (!getList || listSelector !== "") { - const getDeepestElementFromPoint = ( - x: number, - y: number - ): HTMLElement | null => { - let elements = iframeDoc.elementsFromPoint(x, y) as HTMLElement[]; - if (!elements.length) return null; - - const findDeepestElement = ( - elements: HTMLElement[] - ): HTMLElement | null => { - if (!elements.length) return null; - if (elements.length === 1) return elements[0]; - - let deepestElement = elements[0]; - let maxDepth = 0; - - for (const element of elements) { - let depth = 0; - let current = element; - - while (current) { - depth++; - if (current.parentElement) { - current = current.parentElement; - } else { - break; - } - } - - if (depth > maxDepth) { - maxDepth = depth; - deepestElement = element; - } - } - - return deepestElement; - }; - - let deepestElement = findDeepestElement(elements); - if (!deepestElement) return null; - - const traverseShadowDOM = (element: HTMLElement): HTMLElement => { - let current = element; - let shadowRoot = current.shadowRoot; - let deepest = current; - let depth = 0; - const MAX_SHADOW_DEPTH = 4; - - while (shadowRoot && depth < MAX_SHADOW_DEPTH) { - const shadowElement = shadowRoot.elementFromPoint( - x, - y - ) as HTMLElement; - if (!shadowElement || shadowElement === current) break; - - deepest = shadowElement; - current = shadowElement; - shadowRoot = current.shadowRoot; - depth++; - } - - return deepest; - }; - - const isInFrameset = () => { - let node = deepestElement; - while (node && node.parentElement) { - if (node.tagName === "FRAMESET" || node.tagName === "FRAME") { - return true; - } - node = node.parentElement; - } - return false; - }; - - if (deepestElement.tagName === "IFRAME") { - let currentIframe = deepestElement as HTMLIFrameElement; - let depth = 0; - const MAX_IFRAME_DEPTH = 4; - - while (currentIframe && depth < MAX_IFRAME_DEPTH) { - try { - const iframeRect = currentIframe.getBoundingClientRect(); - const iframeX = x - iframeRect.left; - const iframeY = y - iframeRect.top; - - const iframeDocument = - currentIframe.contentDocument || - currentIframe.contentWindow?.document; - if (!iframeDocument) break; - - const iframeElement = iframeDocument.elementFromPoint( - iframeX, - iframeY - ) as HTMLElement; - if (!iframeElement) break; - - deepestElement = traverseShadowDOM(iframeElement); - - if (iframeElement.tagName === "IFRAME") { - currentIframe = iframeElement as HTMLIFrameElement; - depth++; - } else { - break; - } - } catch (error) { - console.warn("Cannot access iframe content:", error); - break; - } - } - } else if (deepestElement.tagName === "FRAME" || isInFrameset()) { - const framesToCheck = []; - - if (deepestElement.tagName === "FRAME") { - framesToCheck.push(deepestElement as HTMLFrameElement); - } - - if (isInFrameset()) { - iframeDoc.querySelectorAll("frame").forEach((frame) => { - framesToCheck.push(frame as HTMLFrameElement); - }); - } - - let frameDepth = 0; - const MAX_FRAME_DEPTH = 4; - - const processFrames = ( - frames: HTMLFrameElement[], - currentDepth: number - ) => { - if (currentDepth >= MAX_FRAME_DEPTH) return; - - for (const frameElement of frames) { - try { - const frameRect = frameElement.getBoundingClientRect(); - const frameX = x - frameRect.left; - const frameY = y - frameRect.top; - - if ( - frameX < 0 || - frameY < 0 || - frameX > frameRect.width || - frameY > frameRect.height - ) { - continue; - } - - const frameDocument = - frameElement.contentDocument || - frameElement.contentWindow?.document; - - if (!frameDocument) continue; - - const frameElementAtPoint = frameDocument.elementFromPoint( - frameX, - frameY - ) as HTMLElement; - if (!frameElementAtPoint) continue; - - deepestElement = traverseShadowDOM(frameElementAtPoint); - - if (frameElementAtPoint.tagName === "FRAME") { - processFrames( - [frameElementAtPoint as HTMLFrameElement], - currentDepth + 1 - ); - } - - break; - } catch (error) { - console.warn("Cannot access frame content:", error); - continue; - } - } - }; - - processFrames(framesToCheck, frameDepth); - } else { - deepestElement = traverseShadowDOM(deepestElement); - } - - return deepestElement; - }; - - const el = getDeepestElementFromPoint(coordinates.x, coordinates.y); - if (el) { - // Prioritize Link (DO NOT REMOVE) - const { parentElement } = el; - const element = parentElement?.tagName === "A" ? parentElement : el; - - const rectangle = element?.getBoundingClientRect(); - if (rectangle) { - const createRectObject = (rect: DOMRect) => ({ - x: rect.x, - y: rect.y, - width: rect.width, - height: rect.height, - top: rect.top, - right: rect.right, - bottom: rect.bottom, - left: rect.left, - toJSON() { - return { - x: this.x, - y: this.y, - width: this.width, - height: this.height, - top: this.top, - right: this.right, - bottom: this.bottom, - left: this.left, - }; - }, - }); - - if (isDOMMode) { - // For DOM mode, return iframe-relative coordinates - return createRectObject(rectangle); - } else { - // For screenshot mode, adjust coordinates relative to the top window - let adjustedRect = createRectObject(rectangle); - let currentWindow = element.ownerDocument.defaultView; - - while (currentWindow !== window.top) { - const frameElement = - currentWindow?.frameElement as HTMLIFrameElement; - if (!frameElement) break; - - const frameRect = frameElement.getBoundingClientRect(); - adjustedRect = createRectObject({ - x: adjustedRect.x + frameRect.x, - y: adjustedRect.y + frameRect.y, - width: adjustedRect.width, - height: adjustedRect.height, - top: adjustedRect.top + frameRect.top, - right: adjustedRect.right + frameRect.left, - bottom: adjustedRect.bottom + frameRect.top, - left: adjustedRect.left + frameRect.left, - } as DOMRect); - - currentWindow = frameElement.ownerDocument.defaultView; - } - - return adjustedRect; - } - } - } - return null; - } else { - const getDeepestElementFromPoint = ( - x: number, - y: number - ): HTMLElement | null => { - let elements = iframeDoc.elementsFromPoint(x, y) as HTMLElement[]; - if (!elements.length) return null; - - const findContainerElement = ( - elements: HTMLElement[] - ): HTMLElement | null => { - if (!elements.length) return null; - if (elements.length === 1) return elements[0]; - - for (let i = 0; i < elements.length; i++) { - const element = elements[i]; - const rect = element.getBoundingClientRect(); - - if (rect.width >= 30 && rect.height >= 30) { - const hasChildrenInList = elements.some( - (otherElement, j) => i !== j && element.contains(otherElement) - ); - - if (hasChildrenInList) { - return element; - } - } - } - - return elements[0]; - }; - - let deepestElement = findContainerElement(elements); - if (!deepestElement) return null; - - if (deepestElement.tagName === "A") { - for (let i = 1; i < elements.length; i++) { - const sibling = elements[i]; - if ( - !deepestElement.contains(sibling) && - !sibling.contains(deepestElement) - ) { - const anchorRect = deepestElement.getBoundingClientRect(); - const siblingRect = sibling.getBoundingClientRect(); - - const isOverlapping = !( - siblingRect.right < anchorRect.left || - siblingRect.left > anchorRect.right || - siblingRect.bottom < anchorRect.top || - siblingRect.top > anchorRect.bottom - ); - - if (isOverlapping) { - deepestElement = sibling; - break; - } - } - } - } - const traverseShadowDOM = (element: HTMLElement): HTMLElement => { let current = element; let shadowRoot = current.shadowRoot; @@ -1233,9 +1178,75 @@ class ClientSelectorGenerator { return deepestElement; }; - const originalEl = getDeepestElementFromPoint( + const el = getDeepestElementFromPoint(coordinates.x, coordinates.y); + if (el) { + // Prioritize Link (DO NOT REMOVE) + const { parentElement } = el; + const element = parentElement?.tagName === "A" ? parentElement : el; + + const rectangle = element?.getBoundingClientRect(); + if (rectangle) { + const createRectObject = (rect: DOMRect) => ({ + x: rect.x, + y: rect.y, + width: rect.width, + height: rect.height, + top: rect.top, + right: rect.right, + bottom: rect.bottom, + left: rect.left, + toJSON() { + return { + x: this.x, + y: this.y, + width: this.width, + height: this.height, + top: this.top, + right: this.right, + bottom: this.bottom, + left: this.left, + }; + }, + }); + + if (isDOMMode) { + // For DOM mode, return iframe-relative coordinates + return createRectObject(rectangle); + } else { + // For screenshot mode, adjust coordinates relative to the top window + let adjustedRect = createRectObject(rectangle); + let currentWindow = element.ownerDocument.defaultView; + + while (currentWindow !== window.top) { + const frameElement = + currentWindow?.frameElement as HTMLIFrameElement; + if (!frameElement) break; + + const frameRect = frameElement.getBoundingClientRect(); + adjustedRect = createRectObject({ + x: adjustedRect.x + frameRect.x, + y: adjustedRect.y + frameRect.y, + width: adjustedRect.width, + height: adjustedRect.height, + top: adjustedRect.top + frameRect.top, + right: adjustedRect.right + frameRect.left, + bottom: adjustedRect.bottom + frameRect.top, + left: adjustedRect.left + frameRect.left, + } as DOMRect); + + currentWindow = frameElement.ownerDocument.defaultView; + } + + return adjustedRect; + } + } + } + return null; + } else { + const originalEl = this.findGroupedContainerAtPoint( coordinates.x, - coordinates.y + coordinates.y, + iframeDoc ); if (originalEl) { let element = originalEl; @@ -2357,7 +2368,7 @@ class ClientSelectorGenerator { listSelector: string ): SelectorResult => { interface DOMContext { - type: "shadow"; // Remove iframe/frame types since we're inside the iframe + type: "shadow"; element: HTMLElement; container: ShadowRoot; host: HTMLElement; @@ -2365,121 +2376,43 @@ class ClientSelectorGenerator { try { if (!listSelector) { - const getDeepestElementFromPoint = ( - x: number, - y: number - ): HTMLElement | null => { - let elements = iframeDoc.elementsFromPoint(x, y) as HTMLElement[]; - if (!elements.length) return null; - - const findContainerElement = ( - elements: HTMLElement[] - ): HTMLElement | null => { - if (!elements.length) return null; - if (elements.length === 1) return elements[0]; - - for (let i = 0; i < elements.length; i++) { - const element = elements[i]; - const rect = element.getBoundingClientRect(); - - if (rect.width >= 30 && rect.height >= 30) { - const hasChildrenInList = elements.some( - (otherElement, j) => i !== j && element.contains(otherElement) - ); - - if (hasChildrenInList) { - return element; - } - } - } - - return elements[0]; - }; - - let deepestElement = findContainerElement(elements); - if (!deepestElement) return null; - - if (deepestElement.tagName === "A") { - for (let i = 1; i < elements.length; i++) { - const sibling = elements[i]; - if ( - !deepestElement.contains(sibling) && - !sibling.contains(deepestElement) - ) { - const anchorRect = deepestElement.getBoundingClientRect(); - const siblingRect = sibling.getBoundingClientRect(); - - const isOverlapping = !( - siblingRect.right < anchorRect.left || - siblingRect.left > anchorRect.right || - siblingRect.bottom < anchorRect.top || - siblingRect.top > anchorRect.bottom - ); - - if (isOverlapping) { - deepestElement = sibling; - break; - } - } - } - } - - const traverseShadowDOM = (element: HTMLElement): HTMLElement => { - let current = element; - let shadowRoot = current.shadowRoot; - let deepest = current; - let depth = 0; - const MAX_SHADOW_DEPTH = 4; - - while (shadowRoot && depth < MAX_SHADOW_DEPTH) { - const shadowElement = shadowRoot.elementFromPoint( - x, - y - ) as HTMLElement; - if (!shadowElement || shadowElement === current) break; - - deepest = shadowElement; - current = shadowElement; - shadowRoot = current.shadowRoot; - depth++; + function generateXPathSelector( + element: HTMLElement, + relative: boolean = false + ): string { + let xpath = relative + ? element.tagName.toLowerCase() + : `//${element.tagName.toLowerCase()}`; + + // Handle table cells specially + if (element.tagName === "TD" || element.tagName === "TH") { + if (element.parentElement) { + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return relative + ? `${element.tagName.toLowerCase()}[${position}]` + : `//tr/${element.tagName.toLowerCase()}[${position}]`; } - - return deepest; - }; - - // REMOVED: All iframe/frame traversal logic since we're already inside the iframe - // Just apply shadow DOM traversal - deepestElement = traverseShadowDOM(deepestElement); - return deepestElement; - }; - - function getNonUniqueSelector(element: HTMLElement): string { - let selector = element.tagName.toLowerCase(); - - // REMOVED: Frame/iframe selector logic since we're already inside - // Keep only regular element logic - - if (selector === "td" && element.parentElement) { - const siblings = Array.from(element.parentElement.children); - const position = siblings.indexOf(element) + 1; - return `${selector}:nth-child(${position})`; } + // Add class-based predicates if (element.className) { const classes = element.className .split(/\s+/) - .filter((cls: string) => Boolean(cls)); - if (classes.length > 0) { - const validClasses = classes.filter( + .filter((cls: string) => Boolean(cls)) + .filter( (cls: string) => !cls.startsWith("!") && !cls.includes(":") ); - if (validClasses.length > 0) { - selector += - "." + validClasses.map((cls) => CSS.escape(cls)).join("."); - } + + if (classes.length > 0) { + const classPredicates = classes + .map((cls) => `contains(@class,'${cls}')`) + .join(" and "); + xpath += `[${classPredicates}]`; } } + // Add positional predicate if there are similar siblings if (element.parentElement) { const siblings = Array.from(element.parentElement.children); const elementClasses = Array.from(element.classList || []); @@ -2492,11 +2425,15 @@ class ClientSelectorGenerator { if (similarSiblings.length > 0) { const position = siblings.indexOf(element) + 1; - selector += `:nth-child(${position})`; + // Remove existing predicates and add position-based one + const baseXpath = relative + ? element.tagName.toLowerCase() + : `//${element.tagName.toLowerCase()}`; + xpath = `${baseXpath}[${position}]`; } } - return selector; + return xpath; } function getContextPath(element: HTMLElement): DOMContext[] { @@ -2506,7 +2443,6 @@ class ClientSelectorGenerator { const MAX_DEPTH = 4; while (current && depth < MAX_DEPTH) { - // ONLY check for shadow DOM, not iframe/frame since we're already inside const rootNode = current.getRootNode(); if (rootNode instanceof ShadowRoot) { path.unshift({ @@ -2519,27 +2455,24 @@ class ClientSelectorGenerator { depth++; continue; } - - // REMOVED: iframe/frame detection logic break; } return path; } - function getSelectorPath(element: HTMLElement | null): string { + function getXPathSelectorPath(element: HTMLElement | null): string { if (!element) return ""; - // Get only shadow DOM context path const contextPath = getContextPath(element); if (contextPath.length > 0) { const selectorParts: string[] = []; contextPath.forEach((context, index) => { - const containerSelector = getNonUniqueSelector(context.host); + const containerSelector = generateXPathSelector(context.host); if (index === contextPath.length - 1) { - const elementSelector = getNonUniqueSelector(element); + const elementSelector = generateXPathSelector(element); selectorParts.push( `${containerSelector} >> ${elementSelector}` ); @@ -2551,15 +2484,17 @@ class ClientSelectorGenerator { return selectorParts.join(" >> "); } - const elementSelector = getNonUniqueSelector(element); + const elementSelector = generateXPathSelector(element); + // For simple cases, return the element selector if ( - elementSelector.includes(".") && - elementSelector.split(".").length > 1 + elementSelector.includes("contains(@class") || + elementSelector.includes("[") ) { return elementSelector; } + // Build path with limited depth const path: string[] = []; let currentElement = element; const MAX_DEPTH = 2; @@ -2570,21 +2505,21 @@ class ClientSelectorGenerator { currentElement !== iframeDoc.body && depth < MAX_DEPTH ) { - const selector = getNonUniqueSelector(currentElement); - path.unshift(selector); + const selector = generateXPathSelector(currentElement); + path.unshift(selector.replace("//", "")); if (!currentElement.parentElement) break; currentElement = currentElement.parentElement; depth++; } - return path.join(" > "); + return "//" + path.join("/"); } - // Main logic to get element and generate selector - const originalEl = getDeepestElementFromPoint( + const originalEl = this.findGroupedContainerAtPoint( coordinates.x, - coordinates.y + coordinates.y, + iframeDoc ); if (!originalEl) return { generalSelector: "" }; @@ -2597,10 +2532,10 @@ class ClientSelectorGenerator { } } - const generalSelector = getSelectorPath(element); + const generalSelector = getXPathSelectorPath(element); return { generalSelector }; } else { - // Similar simplification for when listSelector exists + // Similar logic for when listSelector exists const getDeepestElementFromPoint = ( x: number, y: number @@ -2665,57 +2600,88 @@ class ClientSelectorGenerator { return deepest; }; - // REMOVED: All iframe/frame traversal logic deepestElement = traverseShadowDOM(deepestElement); return deepestElement; }; - function getNonUniqueSelector(element: HTMLElement): string { - let selector = element.tagName.toLowerCase(); + function generateRelativeXPathSelector(element: HTMLElement): string { + let xpath = element.tagName.toLowerCase(); - // REMOVED: Frame/iframe selector logic - - if (selector === "td" && element.parentElement) { + if (xpath === "td" && element.parentElement) { const siblings = Array.from(element.parentElement.children); const position = siblings.indexOf(element) + 1; - return `${selector}:nth-child(${position})`; + return `${xpath}[${position}]`; } - if (element.className) { - const classes = element.className - .split(/\s+/) - .filter((cls: string) => Boolean(cls)); - if (classes.length > 0) { - const validClasses = classes.filter( - (cls: string) => !cls.startsWith("!") && !cls.includes(":") - ); - if (validClasses.length > 0) { - selector += - "." + validClasses.map((cls) => CSS.escape(cls)).join("."); - } - } - } + const className = + typeof element.className === "string" ? element.className : ""; if (element.parentElement) { - const siblings = Array.from(element.parentElement.children); - const elementClasses = Array.from(element.classList || []); + const allSiblings = Array.from(element.parentElement.children); + const sameTagSiblings = allSiblings.filter( + (sibling) => sibling.tagName === element.tagName + ); - const similarSiblings = siblings.filter((sibling) => { - if (sibling === element) return false; - const siblingClasses = Array.from(sibling.classList || []); - return siblingClasses.some((cls) => elementClasses.includes(cls)); - }); + if (sameTagSiblings.length > 1) { + // Multiple siblings with same tag - MUST use position + const position = sameTagSiblings.indexOf(element) + 1; - if (similarSiblings.length > 0) { - const position = siblings.indexOf(element) + 1; - selector += `:nth-child(${position})`; + if (className) { + const classes = className + .split(/\s+/) + .filter((cls: string) => Boolean(cls)) + .filter( + (cls: string) => !cls.startsWith("!") && !cls.includes(":") + ); + + if (classes.length > 0) { + const classPredicates = classes + .map((cls) => `contains(@class,'${cls}')`) + .join(" and "); + xpath += `[${classPredicates}][${position}]`; + } else { + xpath += `[${position}]`; + } + } else { + xpath += `[${position}]`; + } + } else { + // Only one sibling with this tag - classes are sufficient + if (className) { + const classes = className + .split(/\s+/) + .filter((cls: string) => Boolean(cls)) + .filter( + (cls: string) => !cls.startsWith("!") && !cls.includes(":") + ); + + if (classes.length > 0) { + const classPredicates = classes + .map((cls) => `contains(@class,'${cls}')`) + .join(" and "); + xpath += `[${classPredicates}]`; + } + } + } + } else if (className) { + // No parent but has classes + const classes = className + .split(/\s+/) + .filter((cls: string) => Boolean(cls)) + .filter( + (cls: string) => !cls.startsWith("!") && !cls.includes(":") + ); + + if (classes.length > 0) { + const classPredicates = classes + .map((cls) => `contains(@class,'${cls}')`) + .join(" and "); + xpath += `[${classPredicates}]`; } } - return selector; + return `./${xpath}`; // Make it relative } - - // Simplified context path for shadow DOM only function getContextPath(element: HTMLElement): DOMContext[] { const path: DOMContext[] = []; let current = element; @@ -2723,7 +2689,6 @@ class ClientSelectorGenerator { const MAX_DEPTH = 4; while (current && depth < MAX_DEPTH) { - // Only check for shadow DOM const rootNode = current.getRootNode(); if (rootNode instanceof ShadowRoot) { path.unshift({ @@ -2736,65 +2701,41 @@ class ClientSelectorGenerator { depth++; continue; } - break; } return path; } - function getSelectorPath(element: HTMLElement | null): string { + function getRelativeXPathSelectorPath( + element: HTMLElement | null + ): string { if (!element) return ""; - // Get only shadow DOM context path const contextPath = getContextPath(element); if (contextPath.length > 0) { const selectorParts: string[] = []; - contextPath.forEach((context, index) => { - const containerSelector = getNonUniqueSelector(context.host); - - if (index === contextPath.length - 1) { - const elementSelector = getNonUniqueSelector(element); - selectorParts.push( - `${containerSelector} >> ${elementSelector}` - ); - } else { - selectorParts.push(containerSelector); - } - }); - - return selectorParts.join(" >> "); - } - - const elementSelector = getNonUniqueSelector(element); - - if ( - elementSelector.includes(".") && - elementSelector.split(".").length > 1 - ) { - return elementSelector; - } - - const path: string[] = []; - let currentElement = element; - const MAX_DEPTH = 2; - let depth = 0; - - while ( - currentElement && - currentElement !== iframeDoc.body && - depth < MAX_DEPTH - ) { - const selector = getNonUniqueSelector(currentElement); - path.unshift(selector); - - if (!currentElement.parentElement) break; - currentElement = currentElement.parentElement; - depth++; + contextPath.forEach((context, index) => { + const containerSelector = generateRelativeXPathSelector( + context.host + ); + + if (index === contextPath.length - 1) { + const elementSelector = generateRelativeXPathSelector(element); + selectorParts.push( + `${containerSelector} >> ${elementSelector}` + ); + } else { + selectorParts.push(containerSelector); + } + }); + + return selectorParts.join(" >> "); } - return path.join(" > "); + const elementSelector = generateRelativeXPathSelector(element); + return elementSelector; } const originalEl = getDeepestElementFromPoint( @@ -2804,8 +2745,7 @@ class ClientSelectorGenerator { if (!originalEl) return { generalSelector: "" }; let element = originalEl; - - const generalSelector = getSelectorPath(element); + const generalSelector = getRelativeXPathSelectorPath(element); return { generalSelector }; } } catch (error) { @@ -2814,203 +2754,266 @@ class ClientSelectorGenerator { } }; - private getChildSelectors = ( + public getChildSelectors = ( iframeDoc: Document, parentSelector: string ): string[] => { try { - function getNonUniqueSelector(element: HTMLElement): string { - let selector = element.tagName.toLowerCase(); + // Use XPath evaluation to find parent elements + let parentElements: HTMLElement[] = []; - if (selector === "td" && element.parentElement) { - const siblings = Array.from(element.parentElement.children); - const position = siblings.indexOf(element) + 1; - return `${selector}:nth-child(${position})`; - } + if (parentSelector.includes(">>")) { + // Handle shadow DOM + const selectorParts = parentSelector + .split(">>") + .map((part) => part.trim()); - const className = - typeof element.className === "string" ? element.className : ""; - if (className) { - const classes = className - .split(/\s+/) - .filter((cls: string) => Boolean(cls)); - if (classes.length > 0) { - const validClasses = classes.filter( - (cls: string) => !cls.startsWith("!") && !cls.includes(":") - ); - if (validClasses.length > 0) { - selector += - "." + validClasses.map((cls) => CSS.escape(cls)).join("."); + // Evaluate the first part with XPath + parentElements = this.evaluateXPath(selectorParts[0], iframeDoc); + + // Handle shadow DOM traversal + for (let i = 1; i < selectorParts.length; i++) { + const newParentElements: HTMLElement[] = []; + for (const element of parentElements) { + if (element.shadowRoot) { + const shadowChildren = this.evaluateXPath( + selectorParts[i], + element.shadowRoot as any + ); + newParentElements.push(...shadowChildren); } } + parentElements = newParentElements; } + } else { + // Use XPath evaluation directly for regular DOM + parentElements = this.evaluateXPath(parentSelector, iframeDoc); + } - if (element.parentElement) { - const siblings = Array.from(element.parentElement.children); - const elementClasses = Array.from(element.classList || []); + if (parentElements.length === 0) { + console.warn("No parent elements found for selector:", parentSelector); + return []; + } - const similarSiblings = siblings.filter((sibling) => { - if (sibling === element) return false; - const siblingClasses = Array.from(sibling.classList || []); - return siblingClasses.some((cls) => elementClasses.includes(cls)); - }); + const allChildSelectors = new Set(); - if (similarSiblings.length > 0) { - const position = siblings.indexOf(element) + 1; - selector += `:nth-child(${position})`; - } - } + parentElements.forEach((parentElement) => { + const childSelectors = this.generateAbsoluteChildXPaths( + parentElement, + parentSelector + ); + childSelectors.forEach((selector) => allChildSelectors.add(selector)); + }); - return selector; - } + // Convert Set back to array to get unique selectors + const childSelectors = Array.from(allChildSelectors); - // Function to generate selector path from an element to its parent - function getSelectorPath(element: HTMLElement): string { - if (!element || !element.parentElement) return ""; + return childSelectors; + } catch (error) { + console.error("Error in optimized getChildSelectors:", error); + return []; + } + }; - const elementSelector = getNonUniqueSelector(element); + private evaluateXPath( + xpath: string, + contextNode: Document | ShadowRoot + ): HTMLElement[] { + try { + const document = + contextNode instanceof ShadowRoot + ? (contextNode.host as HTMLElement).ownerDocument + : (contextNode as Document); + + const result = document.evaluate( + xpath, + contextNode as any, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); - // Check for shadow DOM context only (removed iframe/frame checks) - const rootNode = element.getRootNode(); - if (rootNode instanceof ShadowRoot) { - const hostSelector = getNonUniqueSelector( - rootNode.host as HTMLElement - ); - return `${hostSelector} >> ${elementSelector}`; + const elements: HTMLElement[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as HTMLElement); } + } + + return elements; + } catch (error) { + return this.fallbackXPathEvaluation(xpath, contextNode); + } + } - // REMOVED: iframe/frame context detection since we're already inside the iframe + private fallbackXPathEvaluation( + xpath: string, + contextNode: Document | ShadowRoot + ): HTMLElement[] { + try { + const simpleTagMatch = xpath.match(/^\/\/(\w+)$/); + if (simpleTagMatch) { + const tagName = simpleTagMatch[1]; + return Array.from( + contextNode.querySelectorAll(tagName) + ) as HTMLElement[]; + } - if ( - elementSelector.includes(".") && - elementSelector.split(".").length > 1 - ) { - return elementSelector; - } + const singleClassMatch = xpath.match( + /^\/\/(\w+)\[contains\(@class,'([^']+)'\)\]$/ + ); + if (singleClassMatch) { + const [, tagName, className] = singleClassMatch; + return Array.from( + contextNode.querySelectorAll(`${tagName}.${CSS.escape(className)}`) + ) as HTMLElement[]; + } - const parentSelector = getNonUniqueSelector(element.parentElement); - return `${parentSelector} > ${elementSelector}`; + const positionMatch = xpath.match(/^\/\/(\w+)\[(\d+)\]$/); + if (positionMatch) { + const [, tagName, position] = positionMatch; + return Array.from( + contextNode.querySelectorAll(`${tagName}:nth-child(${position})`) + ) as HTMLElement[]; } - // Function to get all children from special contexts (simplified for iframe environment) - function getSpecialContextChildren(element: HTMLElement): HTMLElement[] { - const children: HTMLElement[] = []; - - // Get shadow DOM children only - const shadowRoot = element.shadowRoot; - if (shadowRoot) { - const shadowElements = Array.from( - shadowRoot.querySelectorAll("*") - ) as HTMLElement[]; - children.push(...shadowElements); - } + console.warn("⚠️ Could not parse XPath pattern:", xpath); + return []; + } catch (error) { + console.error("❌ Fallback XPath evaluation also failed:", error); + return []; + } + } - // REMOVED: iframe and frame children logic since we're already inside the iframe - // If there are nested iframes/frames inside the DOM content, we don't need to traverse them - // for selector generation purposes within this context + private generateAbsoluteChildXPaths( + parentElement: HTMLElement, + listSelector: string + ): string[] { + const selectors: string[] = []; + const processedElements = new Set(); + + // More efficient traversal - use querySelectorAll to get all descendants at once + const allDescendants = Array.from( + parentElement.querySelectorAll("*") + ) as HTMLElement[]; + + allDescendants.forEach((descendant, index) => { + if (processedElements.has(descendant)) return; + processedElements.add(descendant); + + const absolutePath = this.buildAbsoluteXPath( + descendant, + listSelector, + parentElement + ); - return children; + if (absolutePath) { + selectors.push(absolutePath); } + }); + + // Handle shadow DOM descendants + const shadowElements = this.getShadowDOMDescendants(parentElement); + shadowElements.forEach((shadowElement) => { + const shadowPath = this.buildAbsoluteXPath( + shadowElement, + listSelector, + parentElement + ); + if (shadowPath) { + selectors.push(shadowPath); + } + }); - // Function to recursively get all descendant selectors - function getAllDescendantSelectors(element: HTMLElement): string[] { - let selectors: string[] = []; - - // Handle regular DOM children - const children = Array.from(element.children) as HTMLElement[]; - for (const child of children) { - const childPath = getSelectorPath(child); - if (childPath) { - selectors.push(childPath); - - // Process regular descendants - selectors = selectors.concat(getAllDescendantSelectors(child)); - - // Process special context children (only shadow DOM now) - const specialChildren = getSpecialContextChildren(child); - for (const specialChild of specialChildren) { - const specialPath = getSelectorPath(specialChild); - if (specialPath) { - selectors.push(specialPath); - selectors = selectors.concat( - getAllDescendantSelectors(specialChild) - ); - } - } - } - } + return selectors; + } - // Handle direct special context children - const specialChildren = getSpecialContextChildren(element); - for (const specialChild of specialChildren) { - const specialPath = getSelectorPath(specialChild); - if (specialPath) { - selectors.push(specialPath); - selectors = selectors.concat( - getAllDescendantSelectors(specialChild) - ); - } - } + private getShadowDOMDescendants(element: HTMLElement): HTMLElement[] { + const shadowDescendants: HTMLElement[] = []; + + const traverse = (el: HTMLElement) => { + if (el.shadowRoot) { + const shadowElements = Array.from( + el.shadowRoot.querySelectorAll("*") + ) as HTMLElement[]; + shadowDescendants.push(...shadowElements); - return selectors; + // Recursively check shadow elements for more shadow roots + shadowElements.forEach((shadowEl) => traverse(shadowEl)); } + }; - // Handle shadow DOM parent selectors (simplified) - let parentElements: HTMLElement[] = []; + traverse(element); + return shadowDescendants; + } - // Check for special context traversal in parent selector - if (parentSelector.includes(">>")) { - // Only handle shadow DOM delimiters (removed :>> iframe delimiter handling) - const selectorParts = parentSelector - .split(">>") - .map((part) => part.trim()); + private buildAbsoluteXPath( + targetElement: HTMLElement, + listSelector: string, + listElement: HTMLElement + ): string | null { + try { + // Start with the list selector as base + let xpath = listSelector; - // Start with initial elements - parentElements = Array.from( - iframeDoc.querySelectorAll(selectorParts[0]) - ) as HTMLElement[]; + // Build path from list element to target element + const pathFromList = this.getStructuralPath(targetElement, listElement); - // Traverse through parts (only shadow DOM) - for (let i = 1; i < selectorParts.length; i++) { - const newParentElements: HTMLElement[] = []; + if (!pathFromList) return null; - for (const element of parentElements) { - // Check for shadow DOM only - if (element.shadowRoot) { - const shadowChildren = Array.from( - element.shadowRoot.querySelectorAll(selectorParts[i]) - ) as HTMLElement[]; - newParentElements.push(...shadowChildren); - } + // Append the structural path to the list selector + return xpath + pathFromList; + } catch (error) { + console.error("Error building absolute XPath:", error); + return null; + } + } - // REMOVED: iframe, frame, and frameset traversal logic - } + private getStructuralPath( + targetElement: HTMLElement, + rootElement: HTMLElement + ): string | null { + if (!rootElement.contains(targetElement) || targetElement === rootElement) { + return null; + } - parentElements = newParentElements; - } - } else { - // Regular DOM selector - parentElements = Array.from( - iframeDoc.querySelectorAll(parentSelector) - ) as HTMLElement[]; + const pathParts: string[] = []; + let current = targetElement; + + // Build path from target up to root + while (current && current !== rootElement && current.parentElement) { + const pathPart = this.generateStructuralStep(current); + if (pathPart) { + pathParts.unshift(pathPart); } + current = current.parentElement; + } - const allChildSelectors = new Set(); // Use a set to ensure uniqueness + return pathParts.length > 0 ? "/" + pathParts.join("/") : null; + } - // Process each parent element and its descendants - parentElements.forEach((parentElement) => { - const descendantSelectors = getAllDescendantSelectors(parentElement); - descendantSelectors.forEach((selector) => - allChildSelectors.add(selector) - ); - }); + private generateStructuralStep(element: HTMLElement): string { + const tagName = element.tagName.toLowerCase(); - return Array.from(allChildSelectors); - } catch (error) { - console.error("Error in getChildSelectors:", error); - return []; + if (!element.parentElement) { + return tagName; } - }; + + // Get all sibling elements with the same tag name + const siblings = Array.from(element.parentElement.children).filter( + (sibling) => sibling.tagName === element.tagName + ); + + if (siblings.length === 1) { + // Only one element with this tag - no position needed + return tagName; + } else { + // Multiple elements with same tag - use position + const position = siblings.indexOf(element) + 1; + return `${tagName}[${position}]`; + } + } private getBestSelectorForAction = (action: Action) => { switch (action.type) { @@ -3111,18 +3114,41 @@ class ClientSelectorGenerator { return null; }; + /** + * Enhanced highlighting that detects and highlights entire groups + */ public generateDataForHighlighter( coordinates: Coordinates, iframeDocument: Document, - isDOMMode: boolean = true + isDOMMode: boolean = true, + cachedChildSelectors: string[] = [] ): { rect: DOMRect; selector: string; elementInfo: ElementInfo | null; childSelectors?: string[]; + groupInfo?: { + isGroupElement: boolean; + groupSize: number; + groupElements: HTMLElement[]; + groupFingerprint: ElementFingerprint; + }; } | null { try { - // Use instance variables instead of parameters + if (this.getList === true) { + this.analyzeElementGroups(iframeDocument); + } + + const elementAtPoint = this.findGroupedContainerAtPoint( + coordinates.x, + coordinates.y, + iframeDocument + ); + if (!elementAtPoint) return null; + + const elementGroup = this.getElementGroup(elementAtPoint); + const isGroupElement = elementGroup !== null; + const rect = this.getRect( iframeDocument, coordinates, @@ -3130,11 +3156,7 @@ class ClientSelectorGenerator { this.getList, isDOMMode ); - const displaySelector = this.generateSelector( - iframeDocument, - coordinates, - ActionType.Click - ); + const elementInfo = this.getElementInformation( iframeDocument, coordinates, @@ -3142,41 +3164,337 @@ class ClientSelectorGenerator { this.getList ); - if (!rect || !elementInfo || !displaySelector) { + if (!rect || !elementInfo) { return null; } - const highlighterData = { + let displaySelector: string | null; + let childSelectors: string[] = []; + + if (this.getList === true && this.listSelector !== "") { + childSelectors = + cachedChildSelectors.length > 0 + ? cachedChildSelectors + : this.getChildSelectors(iframeDocument, this.listSelector); + } + + if (isGroupElement && this.getList === true && this.listSelector === "") { + displaySelector = this.generateGroupContainerSelector(elementGroup!); + + return { + rect, + selector: displaySelector, + elementInfo, + groupInfo: { + isGroupElement: true, + groupSize: elementGroup!.elements.length, + groupElements: elementGroup!.elements, + groupFingerprint: elementGroup!.fingerprint, + }, + }; + } else if ( + this.getList === true && + this.listSelector !== "" && + childSelectors.length > 0 && + this.paginationMode === false + ) { + // For child elements within a list, find the matching absolute XPath + displaySelector = this.findMatchingAbsoluteXPath( + elementAtPoint, + childSelectors, + this.listSelector, + iframeDocument + ); + } else { + // Fall back to regular selector generation for non-list elements + displaySelector = this.generateSelector( + iframeDocument, + coordinates, + ActionType.Click + ); + } + + if (!displaySelector) { + return null; + } + + return { rect, selector: displaySelector, elementInfo, - shadowInfo: elementInfo?.isShadowRoot + childSelectors: childSelectors.length > 0 ? childSelectors : undefined, + groupInfo: isGroupElement ? { - mode: elementInfo.shadowRootMode, - content: elementInfo.shadowRootContent, + isGroupElement: true, + groupSize: elementGroup!.elements.length, + groupElements: elementGroup!.elements, + groupFingerprint: elementGroup!.fingerprint, } - : null, + : undefined, }; + } catch (error) { + console.error("Error generating highlighter data:", error); + return null; + } + } - if (this.getList === true) { - if (this.listSelector !== "") { - const childSelectors = this.getChildSelectors( - iframeDocument, - this.listSelector - ); - return { ...highlighterData, childSelectors }; - } else { - return highlighterData; - } - } else { - return highlighterData; + private findMatchingAbsoluteXPath( + targetElement: HTMLElement, + childSelectors: string[], + listSelector: string, + iframeDocument: Document + ): string | null { + try { + // Use XPath evaluation directly instead of CSS conversion + const parentElements = this.evaluateXPath(listSelector, iframeDocument); + + const containingParent = parentElements.find((parent) => + parent.contains(targetElement) + ); + + if (!containingParent) { + console.warn("Could not find containing parent for target element"); + return null; + } + + // Get the structural path from parent to target + const structuralPath = this.getStructuralPath( + targetElement, + containingParent + ); + + if (!structuralPath) { + console.warn("Could not determine structural path"); + return null; + } + + // Construct the absolute XPath + const absoluteXPath = listSelector + structuralPath; + + // Check if this XPath exists in our child selectors + const matchingSelector = childSelectors.find( + (selector) => + selector === absoluteXPath || + this.isEquivalentXPath(selector, absoluteXPath) + ); + + if (matchingSelector) { + return matchingSelector; + } + + // If no exact match, find the closest matching selector + const closestMatch = this.findClosestXPathMatch( + absoluteXPath, + childSelectors + ); + + if (closestMatch) { + return closestMatch; } + + return absoluteXPath; } catch (error) { - console.error("Error generating highlighter data:", error); + console.error("Error finding matching absolute XPath:", error); return null; } } + private isEquivalentXPath(xpath1: string, xpath2: string): boolean { + // Normalize both XPaths for comparison + const normalize = (xpath: string) => { + return xpath + .replace(/\s+/g, " ") // Normalize whitespace + .replace( + /\[\s*contains\s*\(\s*@class\s*,\s*'([^']+)'\s*\)\s*\]/g, + "[contains(@class,'$1')]" + ) // Normalize class predicates + .trim(); + }; + + return normalize(xpath1) === normalize(xpath2); + } + + private findClosestXPathMatch( + targetXPath: string, + candidateSelectors: string[] + ): string | null { + // Extract the path components for comparison + const getPathComponents = (xpath: string) => { + // Remove the list selector prefix and get just the relative path + const pathMatch = xpath.match(/\/([^\/].*)$/); + return pathMatch ? pathMatch[1].split("/") : []; + }; + + const targetComponents = getPathComponents(targetXPath); + + let bestMatch = null; + let bestScore = 0; + + for (const selector of candidateSelectors) { + const selectorComponents = getPathComponents(selector); + + // Calculate similarity score + const commonLength = Math.min( + targetComponents.length, + selectorComponents.length + ); + let score = 0; + + for (let i = 0; i < commonLength; i++) { + if (targetComponents[i] === selectorComponents[i]) { + score++; + } else { + // Check if they're the same tag with different positions + const targetTag = targetComponents[i].replace(/\[\d+\]/, ""); + const selectorTag = selectorComponents[i].replace(/\[\d+\]/, ""); + if (targetTag === selectorTag) { + score += 0.5; // Partial match for same tag + } + break; // Stop at first mismatch + } + } + + if (score > bestScore) { + bestScore = score; + bestMatch = selector; + } + } + + // Only return a match if we have reasonable confidence + return bestScore >= targetComponents.length * 0.7 ? bestMatch : null; + } + + /** + * Generate XPath that matches ALL group elements and ONLY group elements + */ + private generateGroupContainerSelector(group: ElementGroup): string { + const { elements } = group; + + if (!elements || elements.length === 0) return ""; + + // 1. Tag name (ensure all tags match first) + const tagName = elements[0].tagName.toLowerCase(); + if (!elements.every((el) => el.tagName.toLowerCase() === tagName)) { + throw new Error("Inconsistent tag names in group."); + } + + let xpath = `//${tagName}`; + const predicates: string[] = []; + + // 2. Get common classes + const commonClasses = this.getCommonStrings( + elements.map((el) => + (el.getAttribute("class") || "").split(/\s+/).filter(Boolean) + ) + ); + if (commonClasses.length > 0) { + predicates.push( + ...commonClasses.map((cls) => `contains(@class, '${cls}')`) + ); + } + + // 3. Get common attributes (excluding id, style, dynamic ones) + const commonAttributes = this.getCommonAttributes(elements, [ + "id", + "style", + ]); + for (const [attr, value] of Object.entries(commonAttributes)) { + predicates.push(`@${attr}='${value}'`); + } + + // 4. Optional: Common child count + const childrenCountSet = new Set(elements.map((el) => el.children.length)); + if (childrenCountSet.size === 1) { + predicates.push(`count(*)=${[...childrenCountSet][0]}`); + } + + // 5. Build XPath + if (predicates.length > 0) { + xpath += `[${predicates.join(" and ")}]`; + } + + // 6. Post-validate that XPath matches all elements + const matched = document.evaluate( + xpath, + document, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); + const matchedSet = new Set(); + for (let i = 0; i < matched.snapshotLength; i++) { + matchedSet.add(matched.snapshotItem(i) as HTMLElement); + } + + return xpath; + } + + // Returns intersection of strings + private getCommonStrings(lists: string[][]): string[] { + return lists.reduce((acc, list) => + acc.filter((item) => list.includes(item)) + ); + } + + // Returns common attribute key-value pairs across elements + private getCommonAttributes( + elements: Element[], + excludeAttrs: string[] = [] + ): Record { + if (elements.length === 0) return {}; + + const firstEl = elements[0]; + const attrMap: Record = {}; + + for (const attr of Array.from(firstEl.attributes)) { + if (excludeAttrs.includes(attr.name)) continue; + attrMap[attr.name] = attr.value; + } + + for (let i = 1; i < elements.length; i++) { + for (const name of Object.keys(attrMap)) { + const val = elements[i].getAttribute(name); + if (val !== attrMap[name]) { + delete attrMap[name]; // remove if mismatch + } + } + } + + return attrMap; + } + + /** + * Get deepest element from a list of elements + */ + private getDeepestElementFromPoint( + elements: HTMLElement[] + ): HTMLElement | null { + if (!elements.length) return null; + if (elements.length === 1) return elements[0]; + + let deepestElement = elements[0]; + let maxDepth = 0; + + for (const element of elements) { + const depth = this.getElementDepth(element); + if (depth > maxDepth) { + maxDepth = depth; + deepestElement = element; + } + } + + return deepestElement; + } + + /** + * Clean up when component unmounts or mode changes + */ + public cleanup(): void { + this.elementGroups.clear(); + this.groupedElements.clear(); + this.lastAnalyzedDocument = null; + } + // Update generateSelector to use instance variables public generateSelector( iframeDocument: Document,