Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 61 additions & 51 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -262,73 +262,83 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Separate fields into table and non-table categories
const tableFields = {};
const nonTableFields = {};

for (const [label, field] of Object.entries(fields)) {
if (['TD', 'TH', 'TR'].includes(field.tag)) {
tableFields[label] = field;
} else {
nonTableFields[label] = field;
}
}

const parentElements = Array.from(document.querySelectorAll(listSelector));
const scrapedData = [];

while (scrapedData.length < limit) {
let parentElements = Array.from(document.querySelectorAll(listSelector));

// If we only got one element or none, try a more generic approach
if (limit > 1 && parentElements.length <= 1) {
const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
const container = document.querySelector(containerSelector);
for (const parent of parentElements) {
// First, get the number of rows we'll need by checking the first table field
const firstTableField = Object.values(tableFields)[0];
const tableRows = firstTableField
? Array.from(parent.querySelectorAll(firstTableField.selector)).slice(0, limit)
: [null];

tableRows.forEach((_, rowIndex) => {
const record = {};

if (container) {
const allChildren = Array.from(container.children);
// Table fields
for (const [label, { selector, attribute }] of Object.entries(tableFields)) {
const elements = Array.from(parent.querySelectorAll(selector));
const element = elements[rowIndex];

const firstMatch = document.querySelector(listSelector);
if (firstMatch) {
// Get classes from the first matching element
const firstMatchClasses = Array.from(firstMatch.classList);

// Find similar elements by matching most of their classes
parentElements = allChildren.filter(element => {
const elementClasses = Array.from(element.classList);

// Element should share at least 70% of classes with the first match
const commonClasses = firstMatchClasses.filter(cls =>
elementClasses.includes(cls));
return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
});
if (element) {
let value;
if (attribute === 'innerText') {
value = element.innerText.trim();
} else if (attribute === 'innerHTML') {
value = element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
value = attrValue ? new URL(attrValue, window.location.origin).href : null;
} else {
value = element.getAttribute(attribute);
}
record[label] = value;
}
}
}

// Iterate through each parent element
for (const parent of parentElements) {
if (scrapedData.length >= limit) break;
const record = {};

// For each field, select the corresponding element within the parent
for (const [label, { selector, attribute }] of Object.entries(fields)) {
const fieldElement = parent.querySelector(selector);

if (fieldElement) {

// Non table fields
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
const element = parent.querySelector(selector);

if (element) {
let value;
if (attribute === 'innerText') {
record[label] = fieldElement.innerText.trim();
value = element.innerText.trim();
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, window.location.origin).href : null;
} else if (attribute === 'href') {
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, window.location.origin).href : null;
value = element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
value = attrValue ? new URL(attrValue, window.location.origin).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
value = element.getAttribute(attribute);
}
record[label] = value;
}
}
scrapedData.push(record);
}

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Enhance non-table field queries
Note that parent.querySelector always returns only the first matching element, so if multiple elements inside the parent match the same selector, all subsequent ones are silently ignored. Consider using querySelectorAll to handle every match, or add a comment clarifying that only the first match is needed.

-for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
-    const element = parent.querySelector(selector);
+for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
+    const elements = Array.from(parent.querySelectorAll(selector));
+    if (elements.length === 0) {
+        continue;
+    }
+    const element = elements[0]; // or handle all elements

Committable suggestion skipped: line range outside the PR's diff.

if (Object.keys(record).length > 0) {
scrapedData.push(record);
}
});

// If we've processed all available elements and still haven't reached the limit,
// break to avoid infinite loop
if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
if (scrapedData.length >= limit) {
scrapedData.length = limit;
break;
}
}

return scrapedData;
};

Expand Down
61 changes: 58 additions & 3 deletions server/src/workflow-management/selector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ export const getElementInformation = async (
let element = originalEl;

while (element.parentElement) {
if (element.tagName.toLowerCase() === 'body' ||
element.tagName.toLowerCase() === 'html') {
break;
}

const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();

Expand All @@ -99,7 +104,14 @@ export const getElementInformation = async (
(parentRect.width * parentRect.height) > 0.5;

if (fullyContained && significantOverlap) {
element = element.parentElement;
// Only traverse up if next parent isn't body or html
const nextParent = element.parentElement;
if (nextParent.tagName.toLowerCase() !== 'body' &&
nextParent.tagName.toLowerCase() !== 'html') {
element = nextParent;
} else {
break;
}
} else {
break;
}
Expand Down Expand Up @@ -201,6 +213,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
let element = originalEl;

while (element.parentElement) {
if (element.tagName.toLowerCase() === 'body' ||
element.tagName.toLowerCase() === 'html') {
break;
}

const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();

Expand All @@ -215,7 +232,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
(parentRect.width * parentRect.height) > 0.5;

if (fullyContained && significantOverlap) {
element = element.parentElement;
// Only traverse up if next parent isn't body or html
const nextParent = element.parentElement;
if (nextParent.tagName.toLowerCase() !== 'body' &&
nextParent.tagName.toLowerCase() !== 'html') {
element = nextParent;
} else {
break;
}
} else {
break;
}
Expand Down Expand Up @@ -869,6 +893,13 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();

if (selector === 'td' && element.parentElement) {
// Find position among td siblings
const siblings = Array.from(element.parentElement.children);
const position = siblings.indexOf(element) + 1;
return `${selector}:nth-child(${position})`;
}

if (element.className) {
const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls));
if (classes.length > 0) {
Expand Down Expand Up @@ -904,6 +935,11 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates

// if (listSelector === '') {
while (element.parentElement) {
if (element.tagName.toLowerCase() === 'body' ||
element.tagName.toLowerCase() === 'html') {
break;
}

const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();

Expand All @@ -918,7 +954,14 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
(parentRect.width * parentRect.height) > 0.5;

if (fullyContained && significantOverlap) {
element = element.parentElement;
// Only traverse up if next parent isn't body or html
const nextParent = element.parentElement;
if (nextParent.tagName.toLowerCase() !== 'body' &&
nextParent.tagName.toLowerCase() !== 'html') {
element = nextParent;
} else {
break;
}
} else {
break;
}
Expand All @@ -937,6 +980,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();

if (selector === 'td' && element.parentElement) {
const siblings = Array.from(element.parentElement.children);
const position = siblings.indexOf(element) + 1;
return `${selector}:nth-child(${position})`;
}

Comment on lines +1007 to +1012
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consolidate duplicate table cell selector logic.

The table cell selector generation logic is duplicated in three different places. This makes the code harder to maintain and increases the risk of inconsistencies.

Extract this logic into a shared utility function:

+const getTableCellSelector = (element: HTMLElement): string | null => {
+  if (element.tagName.toLowerCase() !== 'td' || !element.parentElement) {
+    return null;
+  }
+  if (element.parentElement.tagName !== 'TR') {
+    console.warn('Table cell found outside of a table row');
+    return null;
+  }
+  const siblings = Array.from(element.parentElement.children);
+  const position = siblings.indexOf(element) + 1;
+  return `td:nth-child(${position})`;
+};

 // Replace all occurrences with:
-if (selector === 'td' && element.parentElement) {
-  const siblings = Array.from(element.parentElement.children);
-  const position = siblings.indexOf(element) + 1;
-  return `${selector}:nth-child(${position})`;
-}
+const cellSelector = getTableCellSelector(element);
+if (cellSelector) {
+  return cellSelector;
+}

Also applies to: 1067-1072

if (element.className) {
const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls));
if (classes.length > 0) {
Expand Down Expand Up @@ -991,6 +1040,12 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();

if (selector === 'td' && element.parentElement) {
const siblings = Array.from(element.parentElement.children);
const position = siblings.indexOf(element) + 1;
return `${selector}:nth-child(${position})`;
}

const className = typeof element.className === 'string' ? element.className : '';
if (className) {
const classes = className.split(/\s+/).filter((cls: string) => Boolean(cls));
Expand Down