Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 185 additions & 55 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -262,77 +262,207 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
const scrapedData = [];
// Helper function to extract values from elements
function extractValue(element, attribute) {
if (!element) return null;

if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
return attrValue ? new URL(attrValue, window.location.origin).href : null;
}
return element.getAttribute(attribute);
}
Comment on lines +265 to +278
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add error handling for URL construction

The `URL` constructor throws on malformed input. Consider adding a try-catch block so that an invalid `src`/`href` value is handled gracefully instead of aborting the scrape.

 } else if (attribute === 'src' || attribute === 'href') {
     const attrValue = element.getAttribute(attribute);
-    return attrValue ? new URL(attrValue, window.location.origin).href : null;
+    if (!attrValue) return null;
+    try {
+        return new URL(attrValue, window.location.origin).href;
+    } catch (error) {
+        console.warn(`Invalid URL: ${attrValue}`);
+        return null;
+    }
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Helper function to extract values from elements
function extractValue(element, attribute) {
if (!element) return null;
if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
return attrValue ? new URL(attrValue, window.location.origin).href : null;
}
return element.getAttribute(attribute);
}
// Helper function to extract values from elements
function extractValue(element, attribute) {
if (!element) return null;
if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
if (!attrValue) return null;
try {
return new URL(attrValue, window.location.origin).href;
} catch (error) {
console.warn(`Invalid URL: ${attrValue}`);
return null;
}
}
return element.getAttribute(attribute);
}


while (scrapedData.length < limit) {
let parentElements = Array.from(document.querySelectorAll(listSelector));
// Helper function to find table ancestors
function findTableAncestor(element) {
let currentElement = element;
const MAX_DEPTH = 5;
let depth = 0;

// If we only got one element or none, try a more generic approach
if (limit > 1 && parentElements.length <= 1) {
const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
const container = document.querySelector(containerSelector);
while (currentElement && depth < MAX_DEPTH) {
if (currentElement.tagName === 'TD') {
return { type: 'TD', element: currentElement };
} else if (currentElement.tagName === 'TR') {
return { type: 'TR', element: currentElement };
}
currentElement = currentElement.parentElement;
depth++;
}
return null;
}

function getCellIndex(td) {
let index = 0;
let sibling = td;
while (sibling = sibling.previousElementSibling) {
index++;
}
return index;
}
Comment on lines +298 to +305
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Refactor the while loop condition

The assignment in the while condition was flagged by static analysis. Consider restructuring for better readability.

 function getCellIndex(td) {
     let index = 0;
     let sibling = td;
-    while (sibling = sibling.previousElementSibling) {
+    while (sibling.previousElementSibling) {
+        sibling = sibling.previousElementSibling;
         index++;
     }
     return index;
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
function getCellIndex(td) {
let index = 0;
let sibling = td;
while (sibling = sibling.previousElementSibling) {
index++;
}
return index;
}
function getCellIndex(td) {
let index = 0;
let sibling = td;
while (sibling.previousElementSibling) {
sibling = sibling.previousElementSibling;
index++;
}
return index;
}
🧰 Tools
🪛 Biome (1.9.4)

[error] 301-301: The assignment should not be in an expression.

The use of assignments in expressions is confusing.
Expressions are often considered as side-effect free.

(lint/suspicious/noAssignInExpressions)


function hasThElement(row, tableFields) {
for (const [label, { selector }] of Object.entries(tableFields)) {
const element = row.querySelector(selector);
if (element) {
let current = element;
while (current && current !== row) {
if (current.tagName === 'TH') {
return true;
}
current = current.parentElement;
}
}
}
return false;
}

/**
 * Chooses which table rows to scrape.
 *
 * If any row has a selected field inside a TH, all rows are returned
 * untouched. Otherwise rows containing a TH are dropped.
 *
 * @param {Element[]} rows - Candidate rows.
 * @param {Object} tableFields - Map of label -> field descriptor with a `selector`.
 * @returns {Element[]} The original array, or a filtered copy without TH rows.
 */
function filterRowsBasedOnTag(rows, tableFields) {
    const fieldTargetsHeader = rows.some(row => hasThElement(row, tableFields));
    return fieldTargetsHeader
        ? rows
        : rows.filter(row => row.getElementsByTagName('TH').length === 0);
}

// Get all containers that match the listSelector
const containers = Array.from(document.querySelectorAll(listSelector));
if (containers.length === 0) return [];

// Initialize arrays to store field classifications for each container
const containerFields = containers.map(() => ({
tableFields: {},
nonTableFields: {}
}));

// Analyze field types for each container
containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(fields)) {
const sampleElement = container.querySelector(field.selector);

if (container) {
const allChildren = Array.from(container.children);

const firstMatch = document.querySelector(listSelector);
if (firstMatch) {
// Get classes from the first matching element
const firstMatchClasses = Array.from(firstMatch.classList);

// Find similar elements by matching most of their classes
parentElements = allChildren.filter(element => {
const elementClasses = Array.from(element.classList);

// Element should share at least 70% of classes with the first match
const commonClasses = firstMatchClasses.filter(cls =>
elementClasses.includes(cls));
return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
});
if (sampleElement) {
const ancestor = findTableAncestor(sampleElement);
if (ancestor) {
containerFields[containerIndex].tableFields[label] = {
...field,
tableContext: ancestor.type,
cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1
};
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
}
});

const scrapedData = [];

// Process each container
containers.forEach((container, containerIndex) => {
const { tableFields, nonTableFields } = containerFields[containerIndex];

// Iterate through each parent element
for (const parent of parentElements) {
if (scrapedData.length >= limit) break;
const record = {};

// For each field, select the corresponding element within the parent
for (const [label, { selector, attribute }] of Object.entries(fields)) {
const fieldElement = parent.querySelector(selector);

if (fieldElement) {
if (attribute === 'innerText') {
record[label] = fieldElement.innerText.trim();
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, window.location.origin).href : null;
} else if (attribute === 'href') {
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, window.location.origin).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
// Handle table fields
if (Object.keys(tableFields).length > 0) {
// Find the common table ancestor
const firstField = Object.values(tableFields)[0];
const firstElement = container.querySelector(firstField.selector);
let tableContext = firstElement;

while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
tableContext = tableContext.parentElement;
}

if (tableContext) {
const rows = Array.from(tableContext.getElementsByTagName('TR'));
const processedRows = filterRowsBasedOnTag(rows, tableFields);

for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
const record = {};
const currentRow = processedRows[rowIndex];

for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
let element = null;

if (cellIndex >= 0) {
const td = currentRow.children[cellIndex];
if (td) {
element = td.querySelector(selector);

if (!element && selector.split(">").pop().includes('td:nth-child')) {
element = td;
}

if (!element) {
const tagOnlySelector = selector.split('.')[0];
element = td.querySelector(tagOnlySelector);
}

if (!element) {
let currentElement = td;
while (currentElement && currentElement.children.length > 0) {
let foundContentChild = false;
for (const child of currentElement.children) {
if (extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
}
element = currentElement;
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add depth limit to content child traversal

The deep traversal for finding content children lacks a depth limit, which could lead to performance issues with deeply nested tables.

+const MAX_CONTENT_SEARCH_DEPTH = 3;
+let depth = 0;
 let currentElement = td;
-while (currentElement && currentElement.children.length > 0) {
+while (currentElement && currentElement.children.length > 0 && depth < MAX_CONTENT_SEARCH_DEPTH) {
     let foundContentChild = false;
     for (const child of currentElement.children) {
         if (extractValue(child, attribute)) {
             currentElement = child;
             foundContentChild = true;
             break;
         }
     }
     if (!foundContentChild) break;
+    depth++;
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
let currentElement = td;
while (currentElement && currentElement.children.length > 0) {
let foundContentChild = false;
for (const child of currentElement.children) {
if (extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
}
element = currentElement;
}
const MAX_CONTENT_SEARCH_DEPTH = 3;
let depth = 0;
let currentElement = td;
while (currentElement && currentElement.children.length > 0 && depth < MAX_CONTENT_SEARCH_DEPTH) {
let foundContentChild = false;
for (const child of currentElement.children) {
if (extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
depth++;
}
element = currentElement;
}

}
} else {
element = currentRow.querySelector(selector);
}

if (element) {
record[label] = extractValue(element, attribute);
}
}

if (Object.keys(record).length > 0) {
scrapedData.push(record);
}
}
}
scrapedData.push(record);
}

// If we've processed all available elements and still haven't reached the limit,
// break to avoid infinite loop
if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
break;
// Handle non-table fields
if (Object.keys(nonTableFields).length > 0) {
const firstField = Object.values(nonTableFields)[0];
const baseElements = Array.from(container.querySelectorAll(firstField.selector));

for (let i = 0; i < Math.min(baseElements.length, limit); i++) {
const record = {};

for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
const elements = Array.from(parent.querySelectorAll(selector));
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix undefined variable reference

The code uses an undefined parent variable instead of the available container variable.

-const elements = Array.from(parent.querySelectorAll(selector));
+const elements = Array.from(container.querySelectorAll(selector));
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const elements = Array.from(parent.querySelectorAll(selector));
const elements = Array.from(container.querySelectorAll(selector));

// Use the same index to maintain correspondence between fields
const element = elements[i];

if (element) {
record[label] = extractValue(element, attribute);
}
}

if (Object.keys(record).length > 0) {
scrapedData.push(record);
}
}
}
}
});

return scrapedData;
};


/**
* Gets all children of the elements matching the listSelector,
* returning their CSS selectors and innerText.
Expand Down
61 changes: 58 additions & 3 deletions server/src/workflow-management/selector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ export const getElementInformation = async (
let element = originalEl;

while (element.parentElement) {
if (element.tagName.toLowerCase() === 'body' ||
element.tagName.toLowerCase() === 'html') {
break;
}

const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();

Expand All @@ -99,7 +104,14 @@ export const getElementInformation = async (
(parentRect.width * parentRect.height) > 0.5;

if (fullyContained && significantOverlap) {
element = element.parentElement;
// Only traverse up if next parent isn't body or html
const nextParent = element.parentElement;
if (nextParent.tagName.toLowerCase() !== 'body' &&
nextParent.tagName.toLowerCase() !== 'html') {
element = nextParent;
} else {
break;
}
} else {
break;
}
Expand Down Expand Up @@ -201,6 +213,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
let element = originalEl;

while (element.parentElement) {
if (element.tagName.toLowerCase() === 'body' ||
element.tagName.toLowerCase() === 'html') {
break;
}

const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();

Expand All @@ -215,7 +232,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
(parentRect.width * parentRect.height) > 0.5;

if (fullyContained && significantOverlap) {
element = element.parentElement;
// Only traverse up if next parent isn't body or html
const nextParent = element.parentElement;
if (nextParent.tagName.toLowerCase() !== 'body' &&
nextParent.tagName.toLowerCase() !== 'html') {
element = nextParent;
} else {
break;
}
} else {
break;
}
Expand Down Expand Up @@ -869,6 +893,13 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();

if (selector === 'td' && element.parentElement) {
// Find position among td siblings
const siblings = Array.from(element.parentElement.children);
const position = siblings.indexOf(element) + 1;
return `${selector}:nth-child(${position})`;
}

if (element.className) {
const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls));
if (classes.length > 0) {
Expand Down Expand Up @@ -904,6 +935,11 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates

// if (listSelector === '') {
while (element.parentElement) {
if (element.tagName.toLowerCase() === 'body' ||
element.tagName.toLowerCase() === 'html') {
break;
}

const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();

Expand All @@ -918,7 +954,14 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
(parentRect.width * parentRect.height) > 0.5;

if (fullyContained && significantOverlap) {
element = element.parentElement;
// Only traverse up if next parent isn't body or html
const nextParent = element.parentElement;
if (nextParent.tagName.toLowerCase() !== 'body' &&
nextParent.tagName.toLowerCase() !== 'html') {
element = nextParent;
} else {
break;
}
} else {
break;
}
Expand All @@ -937,6 +980,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();

if (selector === 'td' && element.parentElement) {
const siblings = Array.from(element.parentElement.children);
const position = siblings.indexOf(element) + 1;
return `${selector}:nth-child(${position})`;
}

Comment on lines +1007 to +1012
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consolidate duplicate table cell selector logic.

The table cell selector generation logic is duplicated in three different places. This makes the code harder to maintain and increases the risk of inconsistencies.

Extract this logic into a shared utility function:

+const getTableCellSelector = (element: HTMLElement): string | null => {
+  if (element.tagName.toLowerCase() !== 'td' || !element.parentElement) {
+    return null;
+  }
+  if (element.parentElement.tagName !== 'TR') {
+    console.warn('Table cell found outside of a table row');
+    return null;
+  }
+  const siblings = Array.from(element.parentElement.children);
+  const position = siblings.indexOf(element) + 1;
+  return `td:nth-child(${position})`;
+};

 // Replace all occurrences with:
-if (selector === 'td' && element.parentElement) {
-  const siblings = Array.from(element.parentElement.children);
-  const position = siblings.indexOf(element) + 1;
-  return `${selector}:nth-child(${position})`;
-}
+const cellSelector = getTableCellSelector(element);
+if (cellSelector) {
+  return cellSelector;
+}

Also applies to: 1067-1072

if (element.className) {
const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls));
if (classes.length > 0) {
Expand Down Expand Up @@ -991,6 +1040,12 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();

if (selector === 'td' && element.parentElement) {
const siblings = Array.from(element.parentElement.children);
const position = siblings.indexOf(element) + 1;
return `${selector}:nth-child(${position})`;
}

const className = typeof element.className === 'string' ? element.className : '';
if (className) {
const classes = className.split(/\s+/).filter((cls: string) => Boolean(cls));
Expand Down